From 099947cafd09a1cab142fccb8d5ebe9f76a0a1b4 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 14 Nov 2018 01:00:49 +0800 Subject: [PATCH] Add input channels consideration referred from ncnn --- CMakeLists.txt | 114 +++++------------------ src/framework/executor.cpp | 30 ++++-- src/operators/kernel/arm/conv_kernel.cpp | 3 +- tools/toolchains/arm-android-neon.cmake | 1 + 4 files changed, 49 insertions(+), 99 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f1cffd332..125af1cb6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,22 +1,15 @@ -cmake_minimum_required(VERSION 3.0.0) +cmake_minimum_required(VERSION 3.0) +project(paddle-mobile) -option(USE_OPENMP "openmp support" ON) -option(DEBUGING "enable debug mode" ON) -option(USE_EXCEPTION "use std exception" ON) -option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io -option(LOG_PROFILE "log profile" OFF) # select the platform to build -option(CPU "armv7 with neon" ON) -option(GPU_MALI "mali gpu" OFF) -option(GPU_CL "opencl gpu" OFF) -option(FPGA "fpga" OFF) -if(FPGA) - option(FPGAV1 "fpga v1" ON) - option(FPGAV2 "fpga v2" OFF) -endif() - +option(CPU "armv7 with neon support" ON) +option(MALI_GPU "mali gpu support" OFF) +option(FPGA "fpga support" OFF) -project(paddle-mobile) +option(USE_OPENMP "openmp support" ON) +option(DEBUGING "enable debug mode" OFF) +option(USE_EXCEPTION "use std exception" OFF) +option(LOG_PROFILE "log profile" ON) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) @@ -36,10 +29,10 @@ if(DEBUGING) message(STATUS "debugging mode") add_definitions(-DPADDLE_MOBILE_DEBUG) else() -endif() - -if(SYMBOL_HIDDEN) - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + if(FPGA) + else() +# add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + endif() endif() if(USE_EXCEPTION) @@ -77,27 +70,7 @@ else() endforeach() endif() -if (GPU_CL) - add_definitions(-DPADDLE_MOBILE_CL) - - # 
opencl version - add_definitions(-DCL_TARGET_OPENCL_VERSION=220) - - link_libraries(${CMAKE_CURRENT_LIST_DIR}/third_party/opencl/libOpenCL.so) - include_directories(third_party/opencl/OpenCL-Headers) -else() - file(GLOB_RECURSE _tmp_list src/framework/cl/*.cpp src/operators/kernel/cl/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - - file(GLOB_RECURSE _tmp_list_h src/framework/cl/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() -endif() - -if (GPU_MALI) +if(MALI_GPU) add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DUSE_ACL=1) add_definitions(-DUSE_OPENCL) @@ -123,43 +96,8 @@ else() endif() if(FPGA) + message("FPGA mode enabled") add_definitions(-DPADDLE_MOBILE_FPGA) - file(GLOB_RECURSE _tmp_list src/operators/math/*.cpp src/operators/kernel/fpga/*.cc) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list_h src/operators/math/*.h) - foreach(f ${_tmp_list_h}) - list(REMOVE_ITEM PADDLE_MOBILE_H ${f}) - endforeach() - list(APPEND PADDLE_MOBILE_CC src/operators/math/softmax.cpp) - list(APPEND PADDLE_MOBILE_h src/operators/math/softmax.h) - list(APPEND PADDLE_MOBILE_h src/operators/math/math_func_neon.h) - if(FPGAV1) - message("FPGA_V1 enabled") - add_definitions(-DPADDLE_MOBILE_FPGA_V1) - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.cpp src/fpga/V2/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - endif() - if(FPGAV2) - message("FPGA_V2 enabled") - add_definitions(-DPADDLE_MOBILE_FPGA_V2) - file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.cpp src/fpga/V1/*.cpp) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - file(GLOB_RECURSE _tmp_list 
src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) - foreach(f ${_tmp_list}) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) - endforeach() - endif() - else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) foreach(f ${_tmp_list}) @@ -186,17 +124,17 @@ endif() if(ANDROID_NDK_TOOLCHAIN_INCLUDED) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) endif() if(IS_IOS) else() - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h) - list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm) - list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h) -endif () + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) + list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) + list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) +endif() set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) @@ -205,10 +143,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) # NET default -if(FPGAV1) - set(NET "FPGA_NET_V1" CACHE STRING "select net type") -elseif(FPGAV2) - set(NET "FPGA_NET_V2" CACHE STRING "select net type") +if(FPGA) + set(NET "FPGAnets" CACHE STRING "select net type") else() set(NET "default" CACHE STRING "select net type") endif() @@ -247,6 +183,4 @@ elseif(FPGA) add_subdirectory(test) endif() - - - 
+add_subdirectory(test) diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index c7ef09ed5a..85fcc44a36 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -30,7 +30,6 @@ limitations under the License. */ #ifdef PADDLE_EXECUTOR_MULTITHREAD #include -#include #include "common/threadpool.h" #endif @@ -96,13 +95,12 @@ Executor::Executor(const framework::Program p, int batch_size, } template -static void LoadMemInternal(void **data, framework::LoDTensor *tensor, - bool quant_uint8 = false) { +void LoadMemInternal(void **data, framework::LoDTensor *tensor) { char **data_buf = reinterpret_cast(data); int64_t size = tensor->numel(); Dtype *tensor_data = tensor->mutable_data(); - if (quant_uint8) { - // should be moved into operator init function + if (0) { + // TODO(hjchen2) should be moved into operator init function float min_value; float max_value; memory::Copy(&min_value, data_buf, sizeof(float)); @@ -158,8 +156,7 @@ void Executor::LoadMemory( // parse tensor from stream switch (tensor_desc.DataType()) { case framework::VARTYPE_TYPE_FP32: - LoadMemInternal(reinterpret_cast(data_buf), tensor, - program_.quantification); + LoadMemInternal(reinterpret_cast(data_buf), tensor); break; case framework::VARTYPE_TYPE_INT8: LoadMemInternal(reinterpret_cast(data_buf), tensor); @@ -266,6 +263,7 @@ std::shared_ptr Executor::Predict( framework::Variable *g_feed_value = program_.scope->Var("feed"); framework::Tensor *feed_tensor = g_feed_value->GetMutable(); + DLOG << "feed_tensor dim: " << feed_tensor->dims(); feed_tensor->Resize(t.dims()); feed_tensor->ShareDataWith(t); std::shared_ptr to_predict_block = @@ -300,7 +298,15 @@ std::shared_ptr Executor::Predict( for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; - _tp[ops[i]->Type()] += timeCost; + if (ops[i]->Type() == "conv2d") { + auto inputs = ops[i]->Inputs(); + auto *filter = framework::GetVarValue( + "Filter", 
inputs, *(program_.scope)); + int kernel_size = filter->dims()[2]; + _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost; + } else { + _tp[ops[i]->Type()] += timeCost; + } } printf("====================[ profile ]======================\n"); using prof_t = std::pair; @@ -370,6 +376,14 @@ std::shared_ptr Executor::PredictLod( for (int i = 0; i < profile.size(); i++) { const auto &pInfo = profile[i]; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; + if (ops[i]->Type() == "conv2d") { + auto inputs = ops[i]->Inputs(); + auto input_keys = ops[i]->GetInputKeys(); + auto *filter = framework::GetVarValue( + input_keys[1], inputs, *(program_.scope)); + int kernel_size = filter->dims()[2]; + printf("kernel size: %d\n", kernel_size); + } _tp[ops[i]->Type()] += timeCost; } printf("====================[ profile ]======================\n"); diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 93aaea4afd..9054dbdaad 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -40,7 +40,8 @@ bool ConvKernel::Init(ConvParam *param) { param->Dilations()[0] == param->Dilations()[1] && param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 && param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 && - param->Input()->dims()[2] >= 16) { + param->Input()->dims()[1] >= 16 && + param->Input()->dims()[2] <= 140 /* refered from ncnn */) { param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; // transform weight framework::Tensor *transformed_weight = new framework::Tensor; diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake index 5e431059a9..33a70d82bd 100644 --- a/tools/toolchains/arm-android-neon.cmake +++ b/tools/toolchains/arm-android-neon.cmake @@ -3,3 +3,4 @@ set(ANDROID_PIE TRUE) set(ANDROID_STL "c++_static") set(ANDROID_PLATFORM "android-22") 
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake") +#include("/Users/chenhoujiang/Project/android-ndk-r16b/build/cmake/android.toolchain.cmake") -- GitLab