diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f1cffd332dfc4f1614ca63ed60f358acf59a74b..6a84745c1ffb1a892467286a2681aac6eb5a6c37 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 3.0.0) option(USE_OPENMP "openmp support" ON) -option(DEBUGING "enable debug mode" ON) -option(USE_EXCEPTION "use std exception" ON) +option(DEBUGING "enable debug mode" OFF) +option(USE_EXCEPTION "use std exception" OFF) option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io option(LOG_PROFILE "log profile" OFF) # select the platform to build @@ -247,6 +247,5 @@ elseif(FPGA) add_subdirectory(test) endif() - - +add_subdirectory(test) diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index c7ef09ed5a1466a7396ec9c177eb3c48abd91ad7..85fcc44a360a35e8100f6a8af6d0977fb577c7c0 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -30,7 +30,6 @@ limitations under the License. */ #ifdef PADDLE_EXECUTOR_MULTITHREAD #include -#include #include "common/threadpool.h" #endif @@ -96,13 +95,12 @@ Executor::Executor(const framework::Program p, int batch_size, } template -static void LoadMemInternal(void **data, framework::LoDTensor *tensor, - bool quant_uint8 = false) { +void LoadMemInternal(void **data, framework::LoDTensor *tensor) { char **data_buf = reinterpret_cast(data); int64_t size = tensor->numel(); Dtype *tensor_data = tensor->mutable_data(); - if (quant_uint8) { - // should be moved into operator init function + if (0) { + // TODO(hjchen2) should be moved into operator init function float min_value; float max_value; memory::Copy(&min_value, data_buf, sizeof(float)); @@ -158,8 +156,7 @@ void Executor::LoadMemory( // parse tensor from stream switch (tensor_desc.DataType()) { case framework::VARTYPE_TYPE_FP32: - LoadMemInternal(reinterpret_cast(data_buf), tensor, - program_.quantification); + LoadMemInternal(reinterpret_cast(data_buf), tensor); break; case framework::VARTYPE_TYPE_INT8: 
LoadMemInternal(reinterpret_cast(data_buf), tensor); @@ -266,6 +263,7 @@ std::shared_ptr Executor::Predict( framework::Variable *g_feed_value = program_.scope->Var("feed"); framework::Tensor *feed_tensor = g_feed_value->GetMutable(); + DLOG << "feed_tensor dim: " << feed_tensor->dims(); feed_tensor->Resize(t.dims()); feed_tensor->ShareDataWith(t); std::shared_ptr to_predict_block = @@ -300,7 +298,15 @@ std::shared_ptr Executor::Predict( for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; - _tp[ops[i]->Type()] += timeCost; + if (ops[i]->Type() == "conv2d") { + auto inputs = ops[i]->Inputs(); + auto *filter = framework::GetVarValue( + "Filter", inputs, *(program_.scope)); + int kernel_size = filter->dims()[2]; + _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost; + } else { + _tp[ops[i]->Type()] += timeCost; + } } printf("====================[ profile ]======================\n"); using prof_t = std::pair; @@ -370,6 +376,14 @@ std::shared_ptr Executor::PredictLod( for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; + if (ops[i]->Type() == "conv2d") { + auto inputs = ops[i]->Inputs(); + auto input_keys = ops[i]->GetInputKeys(); + auto *filter = framework::GetVarValue( + input_keys[1], inputs, *(program_.scope)); + int kernel_size = filter->dims()[2]; + printf("kernel size: %d\n", kernel_size); + } _tp[ops[i]->Type()] += timeCost; } printf("====================[ profile ]======================\n"); diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 93aaea4afd7026f792a007b337a35c2bde48ad48..9054dbdaadbb2f11356da1249b6ce6d8947f0d54 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -40,7 +40,8 @@ bool ConvKernel::Init(ConvParam *param) { param->Dilations()[0] == param->Dilations()[1] && 
param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 && param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 && - param->Input()->dims()[2] >= 16) { + param->Input()->dims()[1] >= 16 && + param->Input()->dims()[2] <= 140 /* referred from ncnn */) { param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; // transform weight framework::Tensor *transformed_weight = new framework::Tensor; diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake index 5e431059a974810b2fd0481e0942447f57bf1286..33a70d82bd78c15dd28ea7574d6df324f8cc64aa 100644 --- a/tools/toolchains/arm-android-neon.cmake +++ b/tools/toolchains/arm-android-neon.cmake @@ -3,3 +3,4 @@ set(ANDROID_PIE TRUE) set(ANDROID_STL "c++_static") set(ANDROID_PLATFORM "android-22") include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake") +#include("/Users/chenhoujiang/Project/android-ndk-r16b/build/cmake/android.toolchain.cmake")