diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 11a803ff031706a10f282f21024915be68444546..45be0e4d3a7e1b7daff3fd226b53b06ad96fb73d 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -18,6 +18,7 @@ endif()
 
 set(ANDROID TRUE)
 add_definitions(-DLITE_WITH_LINUX)
+add_definitions(-DLITE_WITH_ANDROID)
 
 if(NOT DEFINED ANDROID_NDK)
     set(ANDROID_NDK $ENV{NDK_ROOT})
diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc
index 502b28d7b4c4e27276d9ac8880c9d46ee25191b1..1060602e12f5821a1c2f110d01a87d5fc6902704 100644
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/api/cxx_api.h"
+#include <algorithm>
 #include <memory>
 #include <string>
 #include <utility>
@@ -52,35 +53,36 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
 }
 
 // get inputs names
-std::vector<std::string> Predictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto &item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
+const std::vector<std::string> &Predictor::GetInputNames() {
+  return input_names_;
 }
 
 // get outputnames
-std::vector<std::string> Predictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto &item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
+const std::vector<std::string> &Predictor::GetOutputNames() {
+  return output_names_;
 }
 
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
   auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc *> feeds;
+  std::vector<cpp::OpDesc *> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] =
+        feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] =
+        fetchs[i]->Input("X").front();
+  }
 }
 
 const lite::Tensor *Predictor::GetOutput(size_t offset) const {
@@ -189,16 +191,17 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
 }
 
 // get input by name
 lite::Tensor *Predictor::GetInputByName(const std::string &name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }
diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h
index 3d8dc2f06aca24e23a77a0b32dc85a0959290758..7226f4767ddf91c2e8d9864e4bc7a7665845179a 100644
--- a/lite/api/cxx_api.h
+++ b/lite/api/cxx_api.h
@@ -74,8 +74,8 @@ class LITE_API Predictor {
   // get input by name.
   lite::Tensor* GetInputByName(const std::string& name);
   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();
 
   // Get offset-th col of fetch results.
@@ -107,9 +107,8 @@ class LITE_API Predictor {
   const Scope* exec_scope_;
   std::unique_ptr<RuntimeProgram> program_;
   bool program_generated_{false};
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };
 
 /*
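The rewritten PrepareFeedFetch sizes input_names_/output_names_ first and then places each name at its op's "col" attribute, so the vectors come out ordered by feed/fetch column even when the ops appear out of order in the block. A minimal standalone sketch of that two-pass idea (FeedOp and CollectInputNames are hypothetical stand-ins, not part of the patch):

    #include <string>
    #include <vector>

    // Hypothetical stand-in for the bits of cpp::OpDesc that matter here:
    // a feed op's "col" attribute and the name of its "Out" argument.
    struct FeedOp {
      int col;
      std::string out_name;
    };

    // Size the vector first, then place each name at its "col" index, so the
    // result is ordered by feed column rather than by block traversal order.
    std::vector<std::string> CollectInputNames(const std::vector<FeedOp>& feeds) {
      std::vector<std::string> names(feeds.size());
      for (const FeedOp& op : feeds) {
        names[op.col] = op.out_name;
      }
      return names;
    }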
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index b4fb3828f3b9b38aa3bcefc1df05d6453d55e771..62984ea476a901828367d74874291080667df3d8 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -37,8 +37,8 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
   std::string GetVersion() const override;
 
   // get inputs names and get outputs names
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string> &GetInputNames() override;
+  const std::vector<std::string> &GetOutputNames() override;
 
   std::unique_ptr<lite_api::Tensor> GetTensor(
       const std::string &name) const override;
@@ -76,11 +76,11 @@ std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
   return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
 }
 
-std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetInputNames() {
   return raw_predictor_.GetInputNames();
 }
 
-std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
+const std::vector<std::string> &CxxPaddleApiImpl::GetOutputNames() {
   return raw_predictor_.GetOutputNames();
 }
diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc
index 12963285e482b2ea6c6e761f430699507d45c0c5..d28081c5152024606eb2e453aae1c7ca9eb7cd07 100644
--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/api/light_api.h"
+#include <algorithm>
 
 namespace paddle {
 namespace lite {
@@ -56,16 +57,17 @@ Tensor* LightPredictor::GetInput(size_t offset) {
 }
 
 // get input by name
 Tensor* LightPredictor::GetInputByName(const std::string& name) {
-  if (idx2feeds_.find(name) == idx2feeds_.end()) {
+  auto element = std::find(input_names_.begin(), input_names_.end(), name);
+  if (element == input_names_.end()) {
     LOG(ERROR) << "Model do not have input named with: [" << name
                << "], model's inputs include:";
     for (int i = 0; i < input_names_.size(); i++) {
       LOG(ERROR) << "[" << input_names_[i] << "]";
     }
-    return NULL;
+    return nullptr;
   } else {
-    int idx = idx2feeds_[name];
-    return GetInput(idx);
+    int position = std::distance(input_names_.begin(), element);
+    return GetInput(position);
   }
 }
 
@@ -79,35 +81,36 @@ const Tensor* LightPredictor::GetOutput(size_t offset) {
   return out_var->GetMutable<lite::Tensor>();
 }
 // get inputs names
-std::vector<std::string> LightPredictor::GetInputNames() {
-  std::vector<std::string> input_names;
-  for (auto& item : input_names_) {
-    input_names.push_back(item.second);
-  }
-  return input_names;
+const std::vector<std::string>& LightPredictor::GetInputNames() {
+  return input_names_;
 }
 // get outputnames
-std::vector<std::string> LightPredictor::GetOutputNames() {
-  std::vector<std::string> output_names;
-  for (auto& item : output_names_) {
-    output_names.push_back(item.second);
-  }
-  return output_names;
+const std::vector<std::string>& LightPredictor::GetOutputNames() {
+  return output_names_;
 }
 // append the names of inputs and outputs into input_names_ and output_names_
 void LightPredictor::PrepareFeedFetch() {
   auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
+  std::vector<cpp::OpDesc*> feeds;
+  std::vector<cpp::OpDesc*> fetchs;
   for (int i = 0; i < current_block->OpsSize(); i++) {
     auto op = current_block->GetOp<cpp::OpDesc>(i);
     if (op->Type() == "feed") {
-      int idx = op->GetAttr<int>("col");
-      input_names_[idx] = op->Output("Out").front();
-      idx2feeds_[op->Output("Out").front()] = idx;
+      feeds.push_back(op);
     } else if (op->Type() == "fetch") {
-      int idx = op->GetAttr<int>("col");
-      output_names_[idx] = op->Input("X").front();
+      fetchs.push_back(op);
     }
   }
+  input_names_.resize(feeds.size());
+  output_names_.resize(fetchs.size());
+  for (int i = 0; i < feeds.size(); i++) {
+    input_names_[feeds[i]->GetAttr<int>("col")] =
+        feeds[i]->Output("Out").front();
+  }
+  for (int i = 0; i < fetchs.size(); i++) {
+    output_names_[fetchs[i]->GetAttr<int>("col")] =
+        fetchs[i]->Input("X").front();
+  }
 }
 
 void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
diff --git a/lite/api/light_api.h b/lite/api/light_api.h
index 0705e0aba42373dec9f1387573024c5b3bb98bbc..9d69cce441f86e563ad3ed0501514ab1fe79d98e 100644
--- a/lite/api/light_api.h
+++ b/lite/api/light_api.h
@@ -64,8 +64,8 @@ class LITE_API LightPredictor {
   }
 
   // get inputnames and get outputnames.
-  std::vector<std::string> GetInputNames();
-  std::vector<std::string> GetOutputNames();
+  const std::vector<std::string>& GetInputNames();
+  const std::vector<std::string>& GetOutputNames();
   void PrepareFeedFetch();
 
  private:
@@ -82,9 +82,8 @@ class LITE_API LightPredictor {
   std::shared_ptr<Scope> scope_;
   std::unique_ptr<RuntimeProgram> program_;
   cpp::ProgramDesc cpp_program_desc_;
-  std::map<size_t, std::string> input_names_;
-  std::map<std::string, size_t> idx2feeds_;
-  std::map<size_t, std::string> output_names_;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
 };
 
 }  // namespace lite
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
index 90e1397d8338adb1ba732fc322ae03520bcce27f..70ab8ac0c03b8dea84da5ef1d6ca9c64c4c9d102 100644
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -32,8 +32,8 @@ class LightPredictorImpl : public PaddlePredictor {
   void Run() override;
   std::string GetVersion() const override;
 
-  std::vector<std::string> GetInputNames() override;
-  std::vector<std::string> GetOutputNames() override;
+  const std::vector<std::string>& GetInputNames() override;
+  const std::vector<std::string>& GetOutputNames() override;
 
   std::unique_ptr<Tensor> GetTensor(
       const std::string& name) const override;
@@ -78,11 +78,11 @@ std::unique_ptr<Tensor> LightPredictorImpl::GetInputByName(
       new Tensor(raw_predictor_->GetInputByName(name)));
 }
 
-std::vector<std::string> LightPredictorImpl::GetInputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetInputNames() {
   return raw_predictor_->GetInputNames();
 }
 
-std::vector<std::string> LightPredictorImpl::GetOutputNames() {
+const std::vector<std::string>& LightPredictorImpl::GetOutputNames() {
   return raw_predictor_->GetOutputNames();
 }
diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc
index 418d97e9e8814b5e6e90a76cbdb6e92677c9c726..d2bbc295ad4b68e7849d5d25f34e0b5117fc846d 100644
--- a/lite/api/light_api_test.cc
+++ b/lite/api/light_api_test.cc
@@ -36,12 +36,14 @@ TEST(LightAPI, load) {
     data[i] = i;
   }
 
-  std::vector<std::string> inputs = predictor.GetInputNames();
+  predictor.PrepareFeedFetch();
+  const std::vector<std::string>& inputs = predictor.GetInputNames();
+  LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor.GetOutputNames();
+  const std::vector<std::string>& outputs = predictor.GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 545ae03f6725de7649b3278835bda973ade2755e..d7e3c014b0fe37a5f1da4210972349ac4124ed6b 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -75,9 +75,9 @@ class LITE_API PaddlePredictor {
   virtual std::string GetVersion() const = 0;
 
   // Get input names
-  virtual std::vector<std::string> GetInputNames() = 0;
+  virtual const std::vector<std::string>& GetInputNames() = 0;
   // Get output names
-  virtual std::vector<std::string> GetOutputNames() = 0;
+  virtual const std::vector<std::string>& GetOutputNames() = 0;
   // Get Input by name
   virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
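Since GetInputNames()/GetOutputNames() now return const references owned by the predictor, call sites should bind the result by reference rather than by value. A sketch of the intended pattern (DumpInputNames is a hypothetical helper, not part of the patch):

    #include <cstdio>
    #include <string>
    #include <vector>
    #include "lite/api/paddle_api.h"

    // Bind the name list by const reference: the vector is owned by the
    // predictor and stays valid for the predictor's lifetime. Writing
    // `std::vector<std::string> names = ...` would still compile, but
    // silently reintroduces the copy this patch removes.
    void DumpInputNames(paddle::lite_api::PaddlePredictor* predictor) {
      const std::vector<std::string>& names = predictor->GetInputNames();
      for (size_t i = 0; i < names.size(); ++i) {
        std::printf("input #%zu: %s\n", i, names[i].c_str());
      }
    }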
diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc
index 63142d49814473e6dc9ee6e553d95fa86b4058c5..443a05d9927cfa461a306ce6c3c32ff6e5024631 100644
--- a/lite/api/paddle_api_test.cc
+++ b/lite/api/paddle_api_test.cc
@@ -37,12 +37,12 @@ TEST(CxxApi, run) {
 
   LOG(INFO) << "Version: " << predictor->GetVersion();
 
-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
     LOG(INFO) << "inputnames: " << inputs[i];
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
     LOG(INFO) << "outputnames: " << outputs[i];
   }
@@ -76,14 +76,14 @@ TEST(LightApi, run) {
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
 
-  std::vector<std::string> inputs = predictor->GetInputNames();
+  auto& inputs = predictor->GetInputNames();
   LOG(INFO) << "input size: " << inputs.size();
   for (int i = 0; i < inputs.size(); i++) {
-    LOG(INFO) << "inputnames: " << inputs[i];
+    LOG(INFO) << "inputnames: " << inputs.at(i);
   }
-  std::vector<std::string> outputs = predictor->GetOutputNames();
+  auto& outputs = predictor->GetOutputNames();
   for (int i = 0; i < outputs.size(); i++) {
-    LOG(INFO) << "outputnames: " << outputs[i];
+    LOG(INFO) << "outputnames: " << outputs.at(i);
   }
 
   LOG(INFO) << "Version: " << predictor->GetVersion();
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index c150b2b1776a7978821286c3ca4e311e75c251a3..896f6c8d33a8665c4c94786dd08af1a097942608 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -35,6 +35,9 @@
 #include <sys/syscall.h>
 #include <unistd.h>
 #endif
+#ifdef LITE_WITH_ANDROID
+#include <sys/system_properties.h>
+#endif
 #if __APPLE__
 #include "TargetConditionals.h"
 #if LITE_WITH_IPHONE
@@ -218,6 +221,7 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
 
 #ifdef LITE_WITH_LINUX
 std::string get_cpu_name() {
+  std::string cpu_name;
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";
   }
@@ -229,12 +233,23 @@ std::string get_cpu_name() {
       break;
     }
     if (strstr(line, "Hardware") != NULL) {
-      fclose(fp);
-      return std::string(line);
+      cpu_name = std::string(line);
     }
   }
+#ifdef LITE_WITH_ANDROID
+  // concatenate the board, platform and chip names onto the cpu name
+  char board_name[128];
+  char platform_name[128];
+  char chip_name[128];
+  __system_property_get("ro.product.board", board_name);
+  __system_property_get("ro.board.platform", platform_name);
+  __system_property_get("ro.chipname", chip_name);
+  cpu_name =
+      cpu_name + "_" + board_name + "_" + platform_name + "_" + chip_name;
+#endif
+  std::transform(cpu_name.begin(), cpu_name.end(), cpu_name.begin(), ::toupper);
   fclose(fp);
-  return "";
+  return cpu_name;
 }
 
 int get_min_freq_khz(int cpuid) {
@@ -780,7 +795,9 @@ bool DeviceInfo::SetCPUInfoByName() {
     cluster_ids_ = {0, 0, 0, 0};
     SetArchInfo(1, kA53);
     return true;
-  } else if (dev_name_.find("KIRIN980") != std::string::npos) {  // Kirin 980
+  } else if (dev_name_.find("KIRIN980") != std::string::npos ||
+             dev_name_.find("KIRIN990") !=
+                 std::string::npos) {  // Kirin 980, Kirin 990
     core_num_ = 8;
     core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
     big_core_ids_ = {4, 5, 6, 7};
@@ -1109,7 +1126,8 @@ void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
 }
 
 bool DeviceInfo::ExtendWorkspace(size_t size) {
-  workspace_.Resize({size + llc_size()});
+  workspace_.Resize(
+      {static_cast<int64_t>(size + static_cast<size_t>(llc_size()))});
   return workspace_.mutable_data<int8_t>() != nullptr;
 }
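For context on the new Android branch in get_cpu_name(): __system_property_get() from <sys/system_properties.h> writes at most PROP_VALUE_MAX bytes (92, terminator included) and returns the value's length, so the 128-byte buffers above are comfortably large. A minimal standalone sketch (GetProp is a hypothetical helper):

    #include <sys/system_properties.h>  // Android NDK only
    #include <string>

    // Read one Android system property, as get_cpu_name() now does for
    // ro.product.board / ro.board.platform / ro.chipname; returns "" when
    // the key is unset.
    std::string GetProp(const char* key) {
      char value[PROP_VALUE_MAX];  // 92 bytes is the documented upper bound
      int len = __system_property_get(key, value);
      return len > 0 ? std::string(value, len) : std::string();
    }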
diff --git a/lite/core/types.cc b/lite/core/types.cc
index ec89e83e5808fb85803adea0555c76b7e424424c..4ea383333d519ac2c481dce459ca49124a64df32 100644
--- a/lite/core/types.cc
+++ b/lite/core/types.cc
@@ -82,6 +82,10 @@ Type StdTypeToRepr<double>() {
   return Type::_float64;
 }
 template <>
+Type StdTypeToRepr<std::vector<char>>() {
+  return Type::_char_list;
+}
+template <>
 Type StdTypeToRepr<std::string>() {
   return Type::_string;
 }
diff --git a/lite/core/types.h b/lite/core/types.h
index efb8a096e569e296f71737ca3d26cb77eb71e62c..8f154f9dd509d3627750ecbf301923a2296252d1 100644
--- a/lite/core/types.h
+++ b/lite/core/types.h
@@ -16,6 +16,7 @@
 
 #include <...>
 #include <string>
+#include <vector>
 #include "lite/api/paddle_place.h"
 #include "lite/utils/all.h"
@@ -36,7 +37,9 @@ enum class Type {
   _float64,
   _bool,
   _string,
-  // primary list types
+  // primary list type
+  _char_list,
+  // list types
   _list,
   // enum type
   _enum,
@@ -89,6 +92,8 @@ Type StdTypeToRepr<float>();
 template <>
 Type StdTypeToRepr<double>();
 template <>
+Type StdTypeToRepr<std::vector<char>>();
+template <>
 Type StdTypeToRepr<std::string>();
 
 // Factors that impact the kernel picking strategy. Multiple factors can be
diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc
index 98007db0d188b8a77477a5148224be71f5b00dd5..ebb96e21d5e856325b7abdb8342df2aea3d5b5c3 100644
--- a/lite/kernels/arm/conv_compute.cc
+++ b/lite/kernels/arm/conv_compute.cc
@@ -39,6 +39,13 @@ void ConvCompute::PrepareForRun() {
   int pad = param.paddings[0];
   int stride = param.strides[0];
 
+  int chin = param.x->dims()[1];
+  int hin = param.x->dims()[2];
+  int win = param.x->dims()[3];
+  int chout = param.output->dims()[1];
+  int hout = param.output->dims()[2];
+  int wout = param.output->dims()[3];
+
   bool kps_equal = (param.paddings[0] == param.paddings[1]) &&
                    (param.strides[0] == param.strides[1]) && (kw == kh);
   bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1);
@@ -54,7 +61,7 @@ void ConvCompute::PrepareForRun() {
     VLOG(3) << "invoking dw conv";
   } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
              no_dilation) {
-    if (ic >= 32 && oc >= 32) {
+    if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) {
       /// winograd conv impl
       impl_ = new WinogradConv;
       VLOG(3) << "invoking winograd conv";
@@ -63,8 +70,8 @@ void ConvCompute::PrepareForRun() {
       impl_ = new DirectConv;
       VLOG(3) << "invoking direct conv";
     }
-  } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal &&
-             no_dilation) {
+  } else if (param.groups == 1 && kw == 3 && stride == 2 &&
+             chin * chout < 4 * hin * win && kps_equal && no_dilation) {
     /// direct conv impl
     impl_ = new DirectConv;
     VLOG(3) << "invoking direct conv";
diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt
index d855ee8e36b8babc40e4820ccd2b19d0b1008d34..67f55881ce4010d1179d9b6013aa560c56dd949e 100644
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -32,6 +32,8 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c
 nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
 nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda)
 nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda)
+nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
+nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda)
 #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
 nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
 nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc
index fafd1ef0c8d449c84c417023fbb81e8d7c3bb43f..fe6ff92c0ce943cad36fbdd4f1408e344d9fd5fd 100644
--- a/lite/kernels/cuda/pool_compute_test.cc
+++ b/lite/kernels/cuda/pool_compute_test.cc
@@ -194,9 +194,9 @@ TEST(pool_cuda, compute) {
     for (auto stride : {1, 2}) {
       for (auto pad : {0, 1}) {
         for (auto n : {1, 2}) {
-          for (auto c : {1, 3, 256}) {
-            for (auto h : {2, 3, 4, 6, 13}) {
-              for (auto w : {2, 3, 4, 6, 13}) {
+          for (auto c : {1, 3}) {
+            for (auto h : {2, 3, 4, 11}) {
+              for (auto w : {2, 3, 4, 11}) {
                 VLOG(3) << "n:" << n << " c:" << c << " h:" << h
                         << " w:" << w << " ksize:" << ksize
                         << " stride:" << stride << " pad:" << pad
diff --git a/lite/kernels/cuda/softmax_compute_test.cc b/lite/kernels/cuda/softmax_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4d53520911a4868c73d7806fcc1bb5bf8bf33df
--- /dev/null
+++ b/lite/kernels/cuda/softmax_compute_test.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/cuda/softmax_compute.h"
+#include <gtest/gtest.h>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+using Tensor = lite::Tensor;
+using DDim = lite::DDim;
+
+template <typename dtype>
+static void softmax_compute_ref(const operators::SoftmaxParam& param) {
+  const dtype* x_data = param.x->mutable_data<dtype>();
+  dtype* output_data = param.output->mutable_data<dtype>();
+  DDim x_dims = param.x->dims();
+  ASSERT_EQ(x_dims.data(), param.output->dims().data());
+  auto x_rank = x_dims.size();
+  int axis = param.axis;
+  if (axis < 0) {
+    axis += x_rank;
+  }
+  int axis_size = x_dims[axis];
+  int outer_num = x_dims.Slice(0, axis).production();
+  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+  int compute_size = outer_num * inner_num;
+  for (int i = 0; i < compute_size; i++) {
+    int idx_inner = i % inner_num;
+    int idx_outer = (i / inner_num) * axis_size;
+    int start = idx_outer * inner_num + idx_inner;
+    int offset;
+
+    offset = start;
+    dtype max_data = std::numeric_limits<dtype>::lowest();
+    for (int j = 0; j < axis_size; j++) {
+      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
+      offset += inner_num;
+    }
+
+    offset = start;
+    dtype sum_data = (dtype)0;
+    for (int j = 0; j < axis_size; j++) {
+      output_data[offset] = exp(x_data[offset] - max_data);
+      sum_data += output_data[offset];
+      offset += inner_num;
+    }
+
+    offset = start;
+    for (int j = 0; j < axis_size; j++) {
+      output_data[offset] /= sum_data;
+      offset += inner_num;
+    }
+  }
+}
+
+TEST(softmax_cuda, compute) {
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  auto& context = ctx->As<CUDAContext>();
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  context.SetExecStream(stream);
+
+  SoftmaxCompute softmax;
+  operators::SoftmaxParam param;
+  softmax.SetContext(std::move(ctx));
+  lite::Tensor x;
+  lite::Tensor x_cpu;
+  lite::Tensor output;
+  lite::Tensor output_cpu;
+  lite::Tensor output_ref;
+  for (auto n : {1, 3}) {
+    for (auto c : {1, 4}) {
+      for (auto h : {5, 1, 112}) {
+        for (auto w : {1, 6, 112}) {
+          for (auto axis : {-2, -1, 0, 1, 2}) {
+            x.Resize({n, c, h, w});
+            x_cpu.Resize({n, c, h, w});
+            output.Resize({n, c, h, w});
+            output_cpu.Resize({n, c, h, w});
+            output_ref.Resize({n, c, h, w});
+            auto* x_cpu_data = x_cpu.mutable_data<float>();
+            auto* output_data = output.mutable_data<float>(TARGET(kCUDA));
+            auto* output_cpu_data = output_cpu.mutable_data<float>();
+            auto* output_ref_data = output_ref.mutable_data<float>();
+            for (int i = 0; i < x.dims().production(); i++) {
+              x_cpu_data[i] = i;
+            }
+            x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data,
+                                                       x_cpu.dims());
+            param.x = &x;
+            param.axis = axis;
+            param.output = &output;
+            softmax.SetParam(param);
+            softmax.Launch();
+            param.x = &x_cpu;
+            param.output = &output_ref;
+            softmax_compute_ref<float>(param);
+            cudaDeviceSynchronize();
+            CopySync<TARGET(kCUDA)>(output_cpu_data,
+                                    output_data,
+                                    sizeof(float) * output.numel(),
+                                    IoDirection::DtoH);
+            for (int i = 0; i < output.dims().production(); i++) {
+              EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/cuda/yolo_box_compute_test.cc b/lite/kernels/cuda/yolo_box_compute_test.cc
index 26b890c9f127a49fff42f102e7bbdab3fffb042a..994251b249e7dc6d8ae8870937c34cfa0323fd22 100644
--- a/lite/kernels/cuda/yolo_box_compute_test.cc
+++ b/lite/kernels/cuda/yolo_box_compute_test.cc
@@ -89,7 +89,7 @@ inline static void calc_label_score(float* scores,
 
 template <typename T>
 static void YoloBoxRef(const T* input,
-                       const T* imgsize,
+                       const int* imgsize,
                        T* boxes,
                        T* scores,
                        const float conf_thresh,
@@ -106,8 +106,8 @@ static void YoloBoxRef(const T* input,
   float box[4];
 
   for (int i = 0; i < n; i++) {
-    int img_height = static_cast<int>(imgsize[2 * i]);
-    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+    int img_height = imgsize[2 * i];
+    int img_width = imgsize[2 * i + 1];
 
     for (int j = 0; j < an_num; j++) {
       for (int k = 0; k < h; k++) {
@@ -184,12 +184,12 @@ TEST(yolo_box, normal) {
   auto* scores_data = scores.mutable_data<float>(TARGET(kCUDA));
 
   float* x_cpu_data = x_cpu.mutable_data<float>();
-  float* sz_cpu_data = sz_cpu.mutable_data<float>();
+  int* sz_cpu_data = sz_cpu.mutable_data<int>();
   float* boxes_cpu_data = boxes_cpu.mutable_data<float>();
   float* scores_cpu_data = scores_cpu.mutable_data<float>();
 
   float* x_ref_data = x_ref.mutable_data<float>();
-  float* sz_ref_data = sz_ref.mutable_data<float>();
+  int* sz_ref_data = sz_ref.mutable_data<int>();
   float* boxes_ref_data = boxes_ref.mutable_data<float>();
   float* scores_ref_data = scores_ref.mutable_data<float>();
 
@@ -203,7 +203,7 @@ TEST(yolo_box, normal) {
   sz_ref_data[1] = 32;
 
   x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
-  sz.Assign<float, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
+  sz.Assign<int, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
 
   param.X = &x;
   param.ImgSize = &sz;
diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc
index fd82350ddd27e9ae4386a02428c6cc097ab2fe1d..13b6cb5b77d00a2a5f733a0015dec4dbebc088b7 100644
--- a/lite/model_parser/model_parser.cc
+++ b/lite/model_parser/model_parser.cc
@@ -727,10 +727,8 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
 
   // Load model
-  std::string prog_path = model_buffer;
-
   naive_buffer::BinaryTable table;
-  table.LoadFromMemory(prog_path.c_str(), prog_path.length());
+  table.LoadFromMemory(model_buffer.c_str(), model_buffer.length());
 
   naive_buffer::proto::ProgramDesc nb_proto_prog(&table);
   nb_proto_prog.Load();
@@ -742,8 +740,7 @@ void LoadModelNaiveFromMemory(const std::string &model_buffer,
   // Load Params
   // NOTE: Only main block be used now.
   // only combined Params are supported in Loading Model from memory
-  std::string combined_params_path = param_buffer;
-  LoadCombinedParamsNaive(combined_params_path, scope, *cpp_prog, true);
+  LoadCombinedParamsNaive(param_buffer, scope, *cpp_prog, true);
 
   VLOG(4) << "Load model from naive buffer memory successfully";
 }
diff --git a/lite/model_parser/naive_buffer/naive_buffer.h b/lite/model_parser/naive_buffer/naive_buffer.h
index e2e2f7fb1ea3cb5b226bf09bd16074f51e171c75..717dd3c5a6b0c48d6a1f2ae0d7dba9f08a6d99f3 100644
--- a/lite/model_parser/naive_buffer/naive_buffer.h
+++ b/lite/model_parser/naive_buffer/naive_buffer.h
@@ -126,6 +126,41 @@ using UInt64Builder = PrimaryBuilder<uint64_t>;
 using Float32Builder = PrimaryBuilder<float>;
 using Float64Builder = PrimaryBuilder<double>;
 
+template <typename Primary>
+class PrimaryListBuilder : public FieldBuilder {
+  std::vector<Primary> data_;
+
+ public:
+  using value_type = Primary;
+
+  explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {}
+  PrimaryListBuilder(BinaryTable* table, const std::vector<Primary>& val)
+      : FieldBuilder(table), data_(val) {}
+
+  /// Set data.
+  void set(const std::vector<Primary>& x) { data_ = x; }
+
+  const std::vector<Primary>& data() const { return data_; }
+
+  /// Save information to the corresponding BinaryTable.
+  void Save() override;
+
+  /// Load information from the corresponding BinaryTable.
+  void Load() override;
+
+  /// Number of elements.
+  size_t size() const { return data_.size(); }
+
+  Type type() const override {
+    return core::StdTypeToRepr<std::vector<Primary>>();
+  }
+
+  /// clear builder
+  void Clear() { data_.clear(); }
+
+  ~PrimaryListBuilder() = default;
+};
+
 /*
  * Builder for all the primary types. int32, float, bool and so on.
  */
@@ -344,6 +379,36 @@ void PrimaryBuilder<Primary>::Load() {
   table()->Consume(sizeof(value_type));
 }
 
+template <typename Primary>
+void PrimaryListBuilder<Primary>::Load() {
+  CHECK(data_.empty()) << "Duplicate load";
+  // Load number of elements first.
+  uint64_t num_elems{};
+  memcpy(&num_elems, table()->cursor(), sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  data_.resize(num_elems);
+  for (uint64_t i = 0; i < num_elems; i++) {
+    memcpy(&data_[i], table()->cursor(), sizeof(value_type));
+    table()->Consume(sizeof(value_type));
+  }
+}
+
+template <typename Primary>
+void PrimaryListBuilder<Primary>::Save() {
+  // store number of elements in the head.
+  uint64_t num_elems = size();
+  table()->Require(sizeof(uint64_t));
+  memcpy(table()->cursor(), &num_elems, sizeof(uint64_t));
+  table()->Consume(sizeof(uint64_t));
+
+  table()->Require(num_elems * sizeof(value_type));
+  memcpy(table()->cursor(),
+         reinterpret_cast<const char*>(&data_[0]),
+         num_elems * sizeof(value_type));
+  table()->Consume(num_elems * sizeof(value_type));
+}
+
 template <typename EnumType>
 void EnumBuilder<EnumType>::Save() {
   value_type holder = static_cast<value_type>(data_);
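PrimaryListBuilder replaces the old per-element ListBuilder<CharBuilder> encoding with one fixed-size count header followed by the raw elements. A sketch of reading that layout back from a bare byte buffer, assuming the format shown in Save() above (DecodePrimaryList is a hypothetical helper, not part of the patch):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Decode the layout written by PrimaryListBuilder<T>::Save():
    // an 8-byte host-order element count followed by `count` raw T values.
    template <typename T>
    std::vector<T> DecodePrimaryList(const char* buf) {
      uint64_t count = 0;
      std::memcpy(&count, buf, sizeof(uint64_t));  // count header
      std::vector<T> out(count);
      std::memcpy(out.data(), buf + sizeof(uint64_t), count * sizeof(T));
      return out;
    }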
diff --git a/lite/model_parser/naive_buffer/param_desc.cc b/lite/model_parser/naive_buffer/param_desc.cc
index d7e2b4caec062988e8f20486402a05f7b7c27143..4397b3c413e8a09d2e5e5b41b8f9222bcfab4e20 100644
--- a/lite/model_parser/naive_buffer/param_desc.cc
+++ b/lite/model_parser/naive_buffer/param_desc.cc
@@ -149,15 +149,16 @@ void ParamDesc::SetDim(const std::vector<int64_t>& dim) {
     CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                 \
         << "Data Type mismatch";                                            \
     std::vector<T> res;                                                     \
-    auto& data_builder = desc_->GetField<ListBuilder<CharBuilder>>("data"); \
-    auto data = RepeatedToVector<char, CharBuilder>(data_builder);          \
+    auto& data_builder = desc_->GetField<PrimaryListBuilder<char>>("data"); \
+    auto& data = data_builder.data();                                       \
     size_t size = data.size() / sizeof(T);                                  \
-    auto* data_ptr = reinterpret_cast<T*>(&data[0]);                        \
+    auto* data_ptr = reinterpret_cast<const T*>(&data[0]);                  \
    for (size_t i = 0; i < size; ++i) {                                     \
       res.push_back(data_ptr[i]);                                           \
     }                                                                       \
     return res;                                                             \
   }
+
 GET_DATA_IMPL(uint8_t, UINT8);
 GET_DATA_IMPL(int8_t, INT8);
 GET_DATA_IMPL(int16_t, INT16);
@@ -172,14 +173,13 @@ GET_DATA_IMPL(double, FP64);
     CHECK(GetDataType() == VarDescAPI::VarDataType::type__)                 \
         << "Data Type mismatch, call SetDataType first.";                   \
     auto* data_builder =                                                    \
-        desc_->GetMutableField<ListBuilder<CharBuilder>>("data");           \
+        desc_->GetMutableField<PrimaryListBuilder<char>>("data");           \
     CHECK(data_builder);                                                    \
     data_builder->Clear();                                                  \
     size_t size = size__ * sizeof(T);                                       \
     auto* data_ptr = reinterpret_cast<const char*>(data_ptr__);             \
-    for (size_t i = 0; i < size; ++i) {                                     \
-      data_builder->New()->set(data_ptr[i]);                                \
-    }
+    std::vector<char> data_vec(data_ptr, data_ptr + size);                  \
+    data_builder->set(data_vec);
 
 #define SET_DATA_IMPL(T, type__)                                            \
   template <>                                                               \
diff --git a/lite/model_parser/naive_buffer/proto/framework.nb.h b/lite/model_parser/naive_buffer/proto/framework.nb.h
index f495a12b460c57e2464a76409d69778f4e2754a8..2427e49d2690811ded0a19d7a7bd6dec1ef6394a 100644
--- a/lite/model_parser/naive_buffer/proto/framework.nb.h
+++ b/lite/model_parser/naive_buffer/proto/framework.nb.h
@@ -191,7 +191,7 @@ class ParamDesc : public StructBuilder {
     New<...>("lod");
     NewUInt32("tensor_version");
     New<...>("tensor_desc");
-    New<ListBuilder<CharBuilder>>("data");
+    New<PrimaryListBuilder<char>>("data");
   }
 };
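With this change a tensor's payload lives in the naive buffer as a single char list and is reinterpreted on read, as GET_DATA_IMPL above does; e.g. a 6-element float param occupies 24 chars. A hypothetical typed view over such a buffer:

    #include <vector>

    // GET_DATA_IMPL above recovers a typed view from the raw char list:
    // a 6-element float param is stored as 24 chars and cast back on read.
    std::vector<float> AsFloats(const std::vector<char>& data) {
      const float* ptr = reinterpret_cast<const float*>(data.data());
      return std::vector<float>(ptr, ptr + data.size() / sizeof(float));
    }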
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
index 10dff5371a0f6840e092287d97eff98722e3b7f7..668419cf7ceae4a2e10cd447d57824f826cabd3a 100644
--- a/lite/operators/conv_op.cc
+++ b/lite/operators/conv_op.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "lite/operators/conv_op.h"
+#include <algorithm>
 #include <vector>
 #include "lite/core/op_registry.h"
@@ -51,10 +52,41 @@ inline int ConvOutputSize(
   return output_size;
 }
 
+inline void UpdatePaddingAndDilation(std::vector<int>* paddings,
+                                     std::vector<int>* dilations,
+                                     const std::vector<int>& strides,
+                                     const std::string padding_algorithm,
+                                     const lite::DDim data_dims,
+                                     const lite::DDim& ksize) {
+  // when padding_algorithm is "VALID" or "SAME"
+  if (padding_algorithm == "SAME") {
+    for (size_t i = 0; i < strides.size(); ++i) {
+      int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i];
+      int pad_sum =
+          std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2],
+                   (int64_t)0);
+      // pad
+      *(paddings->begin() + i) = pad_sum / 2;
+      // dilation
+      *(dilations->begin() + i) = 1;
+    }
+  } else if (padding_algorithm == "VALID") {
+    for (auto& it : *paddings) {
+      it = 0;
+    }
+  }
+}
+
 bool ConvOpLite::InferShape() const {
   const auto in_dims = param_.x->dims();
   const auto filter_dims = param_.filter->dims();
 
+  UpdatePaddingAndDilation(&param_.paddings,
+                           &param_.dilations,
+                           param_.strides,
+                           padding_algorithm_,
+                           in_dims,
+                           filter_dims);
+
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < param_.strides.size(); ++i) {
     output_shape.push_back(ConvOutputSize(in_dims[i + 2],
diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h
index ac0006c8e6f495d36991cf712c3c80dfcf7a46c9..1d6e1c93490a394723d34de76fc3ff8040d31e81 100644
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
@@ -93,6 +93,10 @@ class ConvOpLite : public OpLite {
             << "The fused conv only supports fuse with relu and leaky relu";
       }
     }
+
+    if (op_desc.HasAttr("padding_algorithm")) {
+      padding_algorithm_ = op_desc.GetAttr<std::string>("padding_algorithm");
+    }
     // For Int8
     if (op_desc.HasAttr("enable_int8")) {
       param_.enable_int8 = op_desc.GetAttr<bool>("enable_int8");
@@ -114,6 +118,7 @@ class ConvOpLite : public OpLite {
 
  private:
   mutable ConvParam param_;
+  std::string padding_algorithm_{""};
 };
 
 }  // namespace operators
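As a worked example of the "SAME" branch in UpdatePaddingAndDilation: for input extent 7, stride 2, kernel 3, the output extent is ceil(7/2) = 4 and pad_sum = max((4-1)*2 + 3 - 7, 0) = 2, i.e. one pixel of padding on each side. A self-contained check of that arithmetic (illustrative values only):

    #include <algorithm>
    #include <cassert>

    int main() {
      // One spatial dimension with input extent 7, stride 2, kernel 3.
      int dim = 7, stride = 2, k = 3;
      int out = (dim + stride - 1) / stride;                    // ceil(7/2) == 4
      int pad_sum = std::max((out - 1) * stride + k - dim, 0);  // 6 + 3 - 7 == 2
      assert(out == 4 && pad_sum == 2 && pad_sum / 2 == 1);     // pad 1 per side
      return 0;
    }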