diff --git a/CMakeLists.txt b/CMakeLists.txt index f9e7eb2fe8cc9e731d42db2407a04f300e583b1b..8438ad53b026df19beb9f49becee379a2e45a795 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,9 @@ cmake_minimum_required(VERSION 3.6) +option(USE_OPENMP "openmp support" ON) + project(paddle-mobile) option(DEBUGING "enable debug mode" OFF) -option(USE_OPENMP "openmp support" OFF) option(USE_EXCEPTION "use std exception" OFF) option(LOG_PROFILE "log profile" OFF) # select the platform to build @@ -149,7 +150,17 @@ if (ANDROID_NDK_TOOLCHAIN_INCLUDED) list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) elseif(IS_IOS) - add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) + if(USE_OPENMP) + add_library(paddle-mobile-stage0 STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) + add_custom_target(paddle-mobile ALL + COMMAND libtool -static -o ${CMAKE_BINARY_DIR}/libpaddle-mobile.a ${CMAKE_CURRENT_LIST_DIR}/tools/libomp.a $<TARGET_FILE:paddle-mobile-stage0> + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + DEPENDS paddle-mobile-stage0 + ) + add_dependencies(paddle-mobile paddle-mobile-stage0) + else() + add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) + endif() else () add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) endif () diff --git a/src/ios_io/PaddleMobile.mm b/src/ios_io/PaddleMobile.mm index 9298e7907709bbbed77b3d4d76528689cae7bd93..5c7b801be0ea7967ea0c94813325d41071bb890b 100644 --- a/src/ios_io/PaddleMobile.mm +++ b/src/ios_io/PaddleMobile.mm @@ -58,6 +58,7 @@ static std::mutex shared_mutex; - (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath{ std::string model_path_str = std::string([modelPath UTF8String]); std::string weights_path_str = std::string([weighsPath UTF8String]); + pam_->SetThreadNum(2); if (loaded_ = pam_->Load(model_path_str, weights_path_str, true)) { return YES; } else { diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 
04a76465b834bad7553870e0a1eb0e5d47cdd71f..e5dc220f4c50922c2918251720619de8b4df7b98 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -3002,14 +3002,21 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda, NC = L2 / (KC * sizeof(float)); // make sure MC is multiple of MR, and NC is multiple of NR - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if (MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast<float *>( @@ -3067,14 +3074,21 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda, NC = L2 / (KC * sizeof(float)); // make sure MC is multiple of MR, and NC is multiple of NR - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if (MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast<float *>( @@ -3133,14 +3147,21 @@ void SgemmWithPRelu(int m, int n, int k, 
const float *A, int lda, NC = L2 / (KC * sizeof(float)); // make sure MC is multiple of MR, and NC is multiple of NR - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if (MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA = static_cast<float *>( @@ -3203,9 +3224,13 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if (MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // 补齐 B NC = (n + NR - 1) / NR * NR; @@ -3227,9 +3252,13 @@ void Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, } else { // 对 B 分块 NC = L1 / (KC * sizeof(float)); - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // 补齐 A MC = (m + MR - 1) / MR * MR; @@ -3311,9 +3340,13 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if 
(MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // 补齐 B NC = (n + NR - 1) / NR * NR; @@ -3335,9 +3368,13 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda, } else { // 对 B 分块 NC = L1 / (KC * sizeof(float)); - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // 补齐 A MC = (m + MR - 1) / MR * MR; @@ -3430,9 +3467,13 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(float)); - int mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR - 1) / MR * MR; + if (MC == 0) { + MC = MR; + } else { + int mblock_num = (m + MC - 1) / MC; + MC = (m + mblock_num - 1) / mblock_num; + MC = (MC + MR - 1) / MR * MR; + } // 补齐 B NC = (n + NR - 1) / NR * NR; @@ -3454,9 +3495,13 @@ void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda, } else { // 对 B 分块 NC = L1 / (KC * sizeof(float)); - int nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR - 1) / NR * NR; + if (NC == 0) { + NC = NR; + } else { + int nblock_num = (n + NC - 1) / NC; + NC = (n + nblock_num - 1) / nblock_num; + NC = (NC + NR - 1) / NR * NR; + } // 补齐 A MC = (m + MR - 1) / MR * MR; diff --git a/src/operators/math/math_function.cpp b/src/operators/math/math_function.cpp index 1ef06372292cd2e8311dfd25ae84b22be03676cd..6ef9fb2a8252e82014ebebc22f82066eeb324c0d 100644 --- a/src/operators/math/math_function.cpp +++ b/src/operators/math/math_function.cpp @@ -28,16 +28,9 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a, auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); - // 
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && - // dim_out.size() == - // 2, - // "The input and output of matmul be matrix"); - // - // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && - // platform::is_cpu_place(matrix_b.place()) - // && - // platform::is_cpu_place(matrix_out->place()), - // "Matrix must all be in CPUPlace"); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); int M = dim_out[0]; int N = dim_out[1]; @@ -61,16 +54,9 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); - // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && - // dim_out.size() == - // 2, - // "The input and output of matmul be matrix"); - // - // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && - // platform::is_cpu_place(matrix_b.place()) - // && - // platform::is_cpu_place(matrix_out->place()), - // "Matrix must all be in CPUPlace"); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); int M = dim_out[0]; int N = dim_out[1]; @@ -95,16 +81,9 @@ void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a, auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); - // PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && - // dim_out.size() == - // 2, - // "The input and output of matmul be matrix"); - // - // PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) && - // platform::is_cpu_place(matrix_b.place()) - // && - // platform::is_cpu_place(matrix_out->place()), - // "Matrix must all be in CPUPlace"); + PADDLE_MOBILE_ENFORCE( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); int M = dim_out[0]; int N = dim_out[1]; diff --git 
a/tools/ios-cmake/ios.toolchain.cmake b/tools/ios-cmake/ios.toolchain.cmake index 4db079d01de8db35fca8fbe63b59e58fd5a3463e..6000f7a8e5dffcd8693b56539f4519840ddd8be8 100644 --- a/tools/ios-cmake/ios.toolchain.cmake +++ b/tools/ios-cmake/ios.toolchain.cmake @@ -50,8 +50,13 @@ endif (CMAKE_UNAME) #include (CMakeForceCompiler) #CMAKE_C_COMPILER (/usr/bin/gcc) #CMAKE_CXX_COMPILER (/usr/bin/g++) -set(CMAKE_C_COMPILER /usr/bin/gcc) -set(CMAKE_CXX_COMPILER /usr/bin/g++) +if(USE_OPENMP) + set(CMAKE_C_COMPILER /usr/local/opt/llvm/bin/clang) + set(CMAKE_CXX_COMPILER /usr/local/opt/llvm/bin/clang++) +else() + set(CMAKE_C_COMPILER /usr/bin/gcc) + set(CMAKE_CXX_COMPILER /usr/bin/g++) +endif() set(CMAKE_AR ar CACHE FILEPATH "" FORCE) # Skip the platform compiler checks for cross compiling