Unverified commit 34716531, authored by myq406450149, committed by GitHub

Merge pull request #3 from PaddlePaddle/develop

merge paddlelite
...@@ -72,6 +72,9 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# cv build options
lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF IF NOT LITE_WITH_ARM)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
...@@ -181,7 +184,7 @@ include(external/xxhash) # download install xxhash needed for x86 jit
include(cudnn)
include(configure) # add paddle env configuration
if(LITE_WITH_CUDA)
include(cuda)
endif()
......
...@@ -117,8 +117,12 @@ endif()
if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
if (LITE_WITH_CV)
add_definitions("-DLITE_WITH_CV")
endif()
endif()
if (WITH_ARM_DOTPROD)
add_definitions("-DWITH_ARM_DOTPROD")
endif()
......
...@@ -26,7 +26,8 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
/usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
/usr/lib/${TARGET_ARCH}-linux-gnu/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
......
...@@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME)
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
if(nv_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
...@@ -507,7 +510,7 @@ function(nv_test TARGET_NAME)
cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest
gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} )
add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog)
common_link(${TARGET_NAME})
add_test(${TARGET_NAME} ${TARGET_NAME})
......
...@@ -43,6 +43,11 @@ function (lite_deps TARGET)
foreach(var ${lite_deps_ARM_DEPS})
set(deps ${deps} ${var})
endforeach(var)
if(LITE_WITH_CV)
foreach(var ${lite_cv_deps})
set(deps ${deps} ${var})
endforeach(var)
endif()
endif()
if(LITE_WITH_PROFILE)
...@@ -152,7 +157,9 @@ function(lite_cc_library TARGET)
endfunction()
function(lite_cc_binary TARGET)
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
...@@ -241,6 +248,7 @@ endfunction()
set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
...@@ -341,7 +349,7 @@ function(add_kernel TARGET device level)
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
nv_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS})
return()
endif()
# the source list will collect for paddle_use_kernel.h code generation.
......
...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
...@@ -116,6 +117,9 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
endif()
if(LITE_WITH_CUDA)
add_dependencies(publish_inference paddle_full_api_shared)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
...@@ -129,6 +133,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
#COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
if(NOT IOS)
#add_dependencies(publish_inference_cxx_lib model_optimize_tool)
...@@ -136,15 +141,17 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
if (ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference paddle_light_api_shared)
add_custom_command(TARGET publish_inference_cxx_lib
COMMAND cp ${CMAKE_BINARY_DIR}/lite/api/*.so ${INFER_LITE_PUBLISH_ROOT}/cxx/lib)
endif()
add_dependencies(publish_inference publish_inference_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
endif()
endif()
else()
if (IOS)
...@@ -153,6 +160,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_lib bundle_light_api)
add_dependencies(publish_inference tiny_publish_lib)
...@@ -164,6 +172,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
)
add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference tiny_publish_cxx_lib)
...@@ -183,11 +192,13 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
)
add_dependencies(publish_inference_java_lib paddle_lite_jni PaddlePredictor)
add_dependencies(publish_inference publish_inference_java_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_java_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/java/so/libpaddle_lite_jni.so)
endif()
endif()
if ((ARM_TARGET_OS STREQUAL "android") AND
((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8)))
if (NOT LITE_ON_TINY_PUBLISH)
# copy
...@@ -202,6 +213,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include"
)
add_dependencies(publish_inference_android_cxx_demos logging gflags)
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos)
......
...@@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
...@@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
...@@ -59,6 +61,7 @@ endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
...@@ -76,8 +79,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -76,8 +79,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
CL_DEPS ${opencl_kenrels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kenrels}) FPGA_DEPS ${fpga_kernels})
endif() endif()
# for light api # for light api
...@@ -96,8 +99,8 @@ lite_cc_library(light_api SRCS light_api.cc ...@@ -96,8 +99,8 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kenrels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kenrels}) FPGA_DEPS ${fpga_kernels})
include(ExternalProject) include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
...@@ -289,7 +292,8 @@ if(NOT IOS) ...@@ -289,7 +292,8 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}) X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
...@@ -297,7 +301,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
...@@ -140,21 +140,28 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
// get inputs names
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get outputnames
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
if (!program_) {
GenRuntimeProgram();
}
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
}
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (size_t i = 0; i < feeds.size(); i++) {
...@@ -190,6 +197,7 @@ std::vector<const lite::Tensor *> Predictor::GetOutputs() const {
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
void Predictor::Build(const lite_api::CxxConfig &config,
...@@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
Program program(desc, scope_, inner_places);
/// The first place in valid_places is
core::KernelPickFactor factor;
factor.ConsiderTarget();
factor.ConsiderPrecision();
factor.ConsiderDataLayout();
optimizer_.Run(std::move(program), inner_places, factor, passes);
exec_scope_ = optimizer_.exec_scope();
PrepareFeedFetch();
...@@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
auto *var = exec_scope_->FindVar(name);
return &var->Get<lite::Tensor>();
}
// get input by name
lite::Tensor *Predictor::GetInputByName(const std::string &name) {
auto element = std::find(input_names_.begin(), input_names_.end(), name);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_bool(is_run_model_optimize,
false,
"apply model_optimize_tool to model, use optimized model to test");
namespace paddle {
namespace lite_api {
void OutputOptModel(const std::string& load_model_dir,
const std::string& save_optimized_model_dir) {
lite_api::CxxConfig config;
config.set_model_dir(load_model_dir);
config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
});
auto predictor = lite_api::CreatePaddlePredictor(config);
int ret = system(
paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str())
.c_str());
if (ret == 0) {
LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
}
predictor->SaveOptimizedModel(save_optimized_model_dir,
LiteModelType::kNaiveBuffer);
LOG(INFO) << "Load model from " << load_model_dir;
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::string& model_dir,
const int repeat,
const int warmup_times,
const int thread_num) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
config.set_threads(thread_num);
if (thread_num == 1) {
config.set_power_mode(LITE_POWER_HIGH);
} else {
config.set_power_mode(LITE_POWER_NO_BIND);
}
auto predictor = lite_api::CreatePaddlePredictor(config);
// set input
auto input_image = predictor->GetInput(0);
input_image->Resize({1, 3, 300, 300});
auto input_image_data = input_image->mutable_data<float>();
std::ifstream read_file("/data/local/tmp/pjc/ssd_img.txt");
if (!read_file.is_open()) {
LOG(INFO) << "read image file fail";
return;
}
auto input_shape = input_image->shape();
int64_t input_image_size = 1;
for (auto t : input_shape) {
input_image_size *= t;
}
for (int i = 0; i < input_image_size; i++) {
read_file >> input_image_data[i];
}
// warmup and run
for (int i = 0; i < warmup_times; ++i) {
predictor->Run();
}
auto start = lite::GetCurrentUS();
for (int i = 0; i < repeat; ++i) {
predictor->Run();
}
// show result
auto end = lite::GetCurrentUS();
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (end - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto out = predictor->GetOutput(0);
auto out_data = out->data<float>();
LOG(INFO) << "output shape:";
auto out_shape = out->shape();
for (auto t : out_shape) {
LOG(INFO) << t;
}
LOG(INFO) << "output data:";
int output_len = 20;
for (int i = 0; i < output_len; i++) {
LOG(INFO) << out_data[i];
}
}
#endif
} // namespace lite_api
} // namespace paddle
TEST(Faster_RCNN, test_arm) {
std::string save_optimized_model_dir;
if (FLAGS_is_run_model_optimize) {
save_optimized_model_dir = FLAGS_model_dir + "opt";
paddle::lite_api::OutputOptModel(FLAGS_model_dir, save_optimized_model_dir);
}
std::string run_model_dir =
FLAGS_is_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(
run_model_dir, FLAGS_repeats, FLAGS_warmup, FLAGS_threads);
}
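For context: this test is configured entirely through gflags. FLAGS_model_dir, FLAGS_warmup, FLAGS_repeats and FLAGS_threads are declared in the included test_helper.h, while is_run_model_optimize is defined locally above, so a typical on-device invocation would pass flags along the lines of --model_dir=/data/local/tmp/<model> --warmup=10 --repeats=100 --threads=1 --is_run_model_optimize=true to the resulting gtest binary (the binary name depends on the CMake test target, which is not shown in this hunk).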
...@@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) {
#ifdef LITE_WITH_OPENCL
TEST(MobileNetV1, test_opencl) {
std::vector<Place> valid_places({
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
TARGET(kARM), // enable kARM CPU kernel when no opencl kernel
});
TestModel(valid_places);
......
...@@ -80,7 +80,16 @@ void Main() {
if (target_repr == "arm") {
valid_places.emplace_back(TARGET(kARM));
} else if (target_repr == "opencl") {
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)});
valid_places.emplace_back(
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)});
valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86));
} else {
......
...@@ -202,7 +202,8 @@ void conv1x1s1_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
} else {
sgemm_prepack(false,
m,
...@@ -395,7 +396,8 @@ void conv_im2col_gemm(const float* i_data,
k,
flag_bias,
bias_group,
flag_relu,
ctx);
} else {
int ldb = n;
sgemm_prepack(false,
......
...@@ -22,6 +22,28 @@ namespace lite {
namespace arm {
namespace math {
inline std::vector<int> get_new_shape(
std::vector<const lite::Tensor*> list_new_shape_tensor) {
// read one int32 value from each shape tensor to build the new shape
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
}
return vec_new_shape;
}
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
lite::Tensor cpu_starts_tensor;
vec_new_data =
std::vector<T>(new_data, new_data + new_data_tensor->dims().production());
return vec_new_data;
}
// The following function bilinear_interp is partially base on
// https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp
// Tencent is pleased to support the open source community by making ncnn
...@@ -472,33 +494,52 @@ void nearest_interp(const float* src,
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
lite::Tensor* Scale,
lite::Tensor* Out,
int out_height,
int out_width,
float scale,
bool with_align,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
if (SizeTensor.size() > 0) {
auto new_size = get_new_shape(SizeTensor);
out_height = new_size[0];
out_width = new_size[1];
} else {
auto scale_tensor = Scale;
if (scale_tensor != nullptr) {
auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
scale = scale_data[0];
}
if (scale > 0) {
out_height = static_cast<int>(in_h * scale);
out_width = static_cast<int>(in_w * scale);
}
auto out_size = OutSize;
if (out_size != nullptr) {
auto out_size_data = get_new_data_from_tensor<float>(out_size);
out_height = static_cast<int>(out_size_data[0]);
out_width = static_cast<int>(out_size_data[1]);
}
}
float height_scale = scale;
float width_scale = scale;
if (out_width > 0 && out_height > 0) {
height_scale = static_cast<float>(out_height / X->dims()[2]);
width_scale = static_cast<float>(out_width / X->dims()[3]);
}
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
float* dout = Out->mutable_data<float>();
const float* din = X->data<float>();
int out_num = Out->dims()[0];
int out_c = Out->dims()[1];
int count = out_num * out_c;
int out_h = Out->dims()[2];
int out_w = Out->dims()[3];
int spatial_in = in_h * in_w;
......
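For readers skimming the new interpolate entry point above: the output size is resolved with the precedence SizeTensor first, then the Scale tensor (or the scale attribute), with OutSize overriding the scale-derived size. A minimal stand-alone sketch of that resolution logic for one dimension (function and parameter names here are illustrative, not part of the patch):

#include <vector>

// Sketch only: mirrors the order used in interpolate() above.
int ResolveOutHeight(int in_h, int out_height, float scale,
                     const std::vector<int>& size_tensor_hw,  // from SizeTensor
                     const int* out_size_hw) {                // from OutSize
  if (!size_tensor_hw.empty()) return size_tensor_hw[0];      // highest priority
  if (scale > 0) out_height = static_cast<int>(in_h * scale); // Scale tensor/attr
  if (out_size_hw != nullptr) out_height = out_size_hw[0];    // OutSize overrides
  return out_height;
}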
...@@ -44,11 +44,12 @@ void nearest_interp(const float* src,
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
std::vector<const lite::Tensor*> SizeTensor,
lite::Tensor* Scale,
lite::Tensor* Out,
int out_height,
int out_width,
float scale,
bool with_align,
std::string interpolate_type);
......
This diff has been collapsed.
...@@ -116,6 +116,27 @@ void pooling3x3s2p1_max(const float* din,
int hin,
int win);
void pooling3x3s1p0_max(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win);
void pooling3x3s1p0_avg(const float* din,
float* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
bool exclusive);
void pooling3x3s2p1_avg(const float* din,
float* dout,
int num,
......
This diff has been collapsed.
...@@ -15,6 +15,8 @@
#pragma once
#include <cmath>
#include "lite/core/context.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
...@@ -28,9 +30,10 @@ bool sgemv(const float* A,
bool transA,
int M,
int N,
bool is_bias,
const float* bias,
bool is_relu,
const ARMContext* ctx);
} // namespace math
} // namespace arm
......
...@@ -56,6 +56,15 @@
CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \
}
const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
inline int CUDA_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
inline int CUDA_GET_BLOCKS(const int N, const int base) {
return (N + base - 1) / base;
}
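As a point of reference, a host-side sketch of how CUDA_NUM_THREADS and CUDA_GET_BLOCKS are typically used to size a 1-D launch; the kernel and wrapper below are illustrative assumptions, not code from this patch:

// Hypothetical element-wise kernel, used only to show the launch arithmetic.
__global__ void scale_kernel(float* data, float alpha, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) data[idx] *= alpha;
}

void LaunchScale(float* d_data, float alpha, int n, cudaStream_t stream) {
  // CUDA_GET_BLOCKS(n) == ceil(n / CUDA_NUM_THREADS), i.e. one thread per element.
  scale_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(
      d_data, alpha, n);
}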
namespace paddle {
namespace lite {
namespace cuda {
......
...@@ -12,6 +12,8 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale
cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
set (
math_cuda
...@@ -21,6 +23,8 @@ set (
cuda_type_trans
cuda_transpose
cuda_elementwise
cuda_gemm
cuda_batched_gemm
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/batched_gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <>
bool BatchedGemm<float, float>::init(const bool trans_a,
const bool trans_b,
const int max_batch_size,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
cudaMalloc(reinterpret_cast<void **>(&A_),
3 * max_batch_size * sizeof(float *));
return true;
}
template <>
bool BatchedGemm<float, float>::run(const float alpha,
const float beta,
const float *a[],
const float *b[],
float *c[],
const int m,
const int n,
const int k,
const int batch_size) {
CHECK(a != nullptr);
CHECK(b != nullptr);
CHECK(c != nullptr);
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cudaMemcpyAsync(A_,
a,
batch_size * sizeof(const float *),
cudaMemcpyHostToDevice,
exe_stream_);
cudaMemcpyAsync(A_ + batch_size,
b,
batch_size * sizeof(const float *),
cudaMemcpyHostToDevice,
exe_stream_);
cudaMemcpyAsync(A_ + batch_size * 2,
c,
batch_size * sizeof(float *),
cudaMemcpyHostToDevice,
exe_stream_);
CUBLAS_CALL(cublasSgemmBatched(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
const_cast<const float **>(A_ + batch_size),
ldb_,
const_cast<const float **>(A_),
lda_,
&beta,
A_ + batch_size * 2,
ldc_,
batch_size));
return true;
}
template <>
bool BatchedGemm<float, float>::run(const float alpha,
const float beta,
const float *a[],
const int m,
const int n,
const int k,
const int batch_size) {
CHECK(a != nullptr);
lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m;
ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cudaMemcpyAsync(A_,
a,
3 * batch_size * sizeof(const float *),
cudaMemcpyDefault,
exe_stream_);
CUBLAS_CALL(cublasSgemmBatched(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
const_cast<const float **>(A_ + batch_size),
ldb_,
const_cast<const float **>(A_),
lda_,
&beta,
A_ + batch_size * 2,
ldc_,
batch_size));
return true;
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class BatchedGemm {
public:
BatchedGemm() : cu_handle_(nullptr) {}
~BatchedGemm() {
if (A_ != nullptr) {
cudaFree(A_);
}
}
bool init(const bool trans_a,
const bool trans_b,
const int max_batch_size,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a[],
const PtypeIn* b[],
PtypeOut* c[],
const int m,
const int n,
const int k,
const int batch_size);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a[],
const int m,
const int n,
const int k,
const int batch_size);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
PtypeIn** A_{nullptr};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
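A rough usage sketch for the BatchedGemm wrapper declared above (the buffer setup and the ctx object are assumed to exist elsewhere; note that run() expects host-side arrays of device pointers, which it copies to the device with cudaMemcpyAsync):

// Sketch only: c[i] = a[i] * b[i] for batch_size row-major [m x k] * [k x n] matrices.
void BatchedMatmulSketch(const float* d_a[], const float* d_b[], float* d_c[],
                         int m, int n, int k, int batch_size,
                         paddle::lite::Context<TARGET(kCUDA)>* ctx) {
  paddle::lite::cuda::math::BatchedGemm<float, float> bgemm;
  bgemm.init(false, false, batch_size, ctx);  // no transpose, reserve the batch
  bgemm.run(1.f, 0.f, d_a, d_b, d_c, m, n, k, batch_size);
  // Work is queued on ctx->exec_stream(); synchronize before reading d_c.
}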
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/gemm.h"
#include <iostream>
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <>
bool Gemm<float, float>::init(const bool trans_a,
bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
lda_ = (!trans_a) ? k : m;
ldb_ = (!trans_b) ? n : k;
ldc_ = n;
m_ = m;
n_ = n;
k_ = k;
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool Gemm<float, float>::init(const bool trans_a,
bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)> *ctx) {
if (cu_handle_ == nullptr) {
this->exe_stream_ = ctx->exec_stream();
CUBLAS_CALL(cublasCreate(&cu_handle_));
CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_));
}
m_ = m;
n_ = n;
k_ = k;
lda_ = lda;
ldb_ = ldb;
ldc_ = ldc;
cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
return true;
}
template <>
bool Gemm<float, float>::run(const float alpha,
const float beta,
const float *a,
const float *b,
float *c,
Context<TARGET(kCUDA)> *ctx) {
CUBLAS_CALL(cublasSgemm(cu_handle_,
cu_trans_b_,
cu_trans_a_,
n_,
m_,
k_,
&alpha,
b,
ldb_,
a,
lda_,
&beta,
c,
ldc_));
return true;
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename PtypeIn, typename PtypeOut>
class Gemm {
public:
Gemm() : cu_handle_(nullptr) {}
~Gemm() {}
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
Context<TARGET(kCUDA)>* ctx);
bool init(const bool trans_a,
const bool trans_b,
const int m,
const int n,
const int k,
const int lda,
const int ldb,
const int ldc,
Context<TARGET(kCUDA)>* ctx);
bool run(const PtypeOut alpha,
const PtypeOut beta,
const PtypeIn* a,
const PtypeIn* b,
PtypeOut* c,
Context<TARGET(kCUDA)>* ctx);
private:
cudaStream_t exe_stream_;
cublasHandle_t cu_handle_;
cublasOperation_t cu_trans_a_;
cublasOperation_t cu_trans_b_;
int m_{-1};
int n_{-1};
int k_{-1};
int lda_{-1};
int ldb_{-1};
int ldc_{-1};
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
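And likewise a minimal sketch for the single-matrix Gemm wrapper (same assumptions as the batched sketch above):

// Sketch only: d_c = d_a * d_b with row-major a: [m x k], b: [k x n], c: [m x n].
void MatmulSketch(const float* d_a, const float* d_b, float* d_c,
                  int m, int n, int k,
                  paddle::lite::Context<TARGET(kCUDA)>* ctx) {
  paddle::lite::cuda::math::Gemm<float, float> gemm;
  gemm.init(false, false, m, n, k, ctx);   // derives lda/ldb/ldc from the shapes
  gemm.run(1.f, 0.f, d_a, d_b, d_c, ctx);  // issues cublasSgemm on ctx->exec_stream()
}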
...@@ -67,7 +67,7 @@ std::string UniqueName(const std::string& prefix) {
return prefix + "_" + std::to_string(counter);
}
ge::DataType CvtPrecisionType(PrecisionType itype) {
ge::DataType otype = ge::DT_FLOAT;
switch (itype) {
case PRECISION(kFloat):
...@@ -80,14 +80,14 @@ ge::DataType CvtPrecisionType(PrecisionType itype) {
otype = ge::DT_INT32;
break;
default:
LOG(FATAL) << "[NPU] Can not convert precision type("
<< PrecisionToStr(itype) << ") from Lite to NPU";
break;
}
return otype;
}
ge::Format CvtDataLayoutType(DataLayoutType itype) {
ge::Format otype = ge::FORMAT_NCHW;
switch (itype) {
case DATALAYOUT(kNCHW):
...@@ -95,17 +95,17 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) {
break;
// TODO(hong19860320) support more data layout type
default:
LOG(FATAL) << "[NPU] Can not convert data layout type("
<< DataLayoutToStr(itype) << ") from Lite to NPU";
break;
}
return otype;
}
ge::TensorPtr CvtTensor(lite::Tensor* in_tensor,
std::vector<int64_t> out_shape,
PrecisionType in_ptype,
DataLayoutType in_ltype) {
uint8_t* in_data = nullptr;
auto in_size = in_tensor->dims().production();
auto in_shape = in_tensor->dims().Vectorize();
...@@ -123,10 +123,10 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
in_bytes = in_size * sizeof(int8_t);
} else {
LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype);
}
ge::DataType out_ptype = CvtPrecisionType(in_ptype);
ge::Format out_ltype = CvtDataLayoutType(in_ltype);
ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype);
CHECK_EQ(out_ltype, ge::FORMAT_NCHW);
...@@ -140,6 +140,31 @@ ge::TensorPtr CvtFromLiteTensor(lite::Tensor* in_tensor,
return out_tensor;
}
int CvtActMode(std::string act_type) {
int act_mode = 1;
if (act_type == "sigmod") {
act_mode = 0;
} else if (act_type == "relu") {
act_mode = 1;
} else if (act_type == "tanh") {
act_mode = 2;
} else if (act_type == "elu") {
act_mode = 4;
} else if (act_type == "abs") {
act_mode = 6;
} else if (act_type == "softsign") {
act_mode = 8;
} else if (act_type == "softplus") {
act_mode = 9;
} else if (act_type == "hardsigmoid") {
act_mode = 10;
} else {
// TODO(hong19860320) support more activation mode
LOG(FATAL) << "[NPU] Unsupported activation type " << act_type;
}
return act_mode;
}
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
......
...@@ -31,117 +31,6 @@
// Extended Ops of HIAI DDK
namespace ge {
/**
* Multiply the matrix x1 by the matrix x2 to generate x1 * x2.
* The inputs must be two-dimensional matrices and the inner dimension of "x1"
* (after being transposed if transpose_x1 is true) must match the outer
* dimension of "x2" (after being transposed if transposed_x2 is true). <Input>
* x : the first input tensor, must be non const op.
* w : the second input tensor, must be const op.
* bias: the optional bias tensor, must be const op.
* <Output>
* y : the output tensor.
* <Attr>
* has_bias: If true, enable input bias.
*/
REG_OP(MatMul)
.INPUT(x, TensorType({DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true
.OP_END();
/**
* Computes the gradients of convolution with respect to the input.
* <Input>
* input_sizes : An integer vector representing the shape of input,
* where input is a 4-D [batch, height, width, channels] tensor.
* filter : the filter tensor, with shape [H , W, filter_channel,
* filter_number], filter_channel must be same as x channel.
* x : The input tensor.
* <Output>
* y : The output tensor.
* <Attr>
* format: 0: NCHW. 1: NHWC
* group : 1: default
* num_output : 0: default, num_output must be equal to
* (filter_channel * group)
* pad : Padding for the beginning and ending along each axis
* stride : Stride along each axis.
* dilation : dilation value along each axis of the filter.
* pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET
* bias_term : 0: default
* kernel : The shape of the convolution kernel
*/
REG_OP(Deconvolution)
.INPUT(input_sizes, TensorType({DT_UINT8}))
.INPUT(filter, TensorType({DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(mode, AttrValue::INT{1})
.ATTR(format, AttrValue::INT{1})
.ATTR(group, AttrValue::INT{1})
.ATTR(num_output, AttrValue::INT{0})
.ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0}))
.ATTR(stride, AttrValue::LIST_INT({1, 1}))
.ATTR(dilation, AttrValue::LIST_INT({1, 1}))
.ATTR(pad_mode, AttrValue::INT{0})
.ATTR(bias_term, AttrValue::INT{0})
.ATTR(kernel, AttrValue::LIST_INT({0, 0}))
.OP_END();
/**
* Resize images to size using bilinear interpolation.
* <Input>
* x : The tensor of 4-D
* w : A int32 Tensor of 2 elements: [height, width].
* <Output>
* y : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels.
* output_dim_mode : Defaults 2, including 0: zoom_factor , 1:
* shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is
* controled by the [height, width] of w.
* shrink_factor : shrink factor.
* zoom_factor : zoom factor.
* pad_begin : begin of pad.
* pad_end : end of pad.
*/
REG_OP(ResizeBilinear)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32}))
.INPUT(w, TensorType({DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
.ATTR(align_corners, AttrValue::BOOL{false})
.ATTR(output_dim_mode, AttrValue::INT{2})
.ATTR(shrink_factor, AttrValue::INT{1})
.ATTR(zoom_factor, AttrValue::INT{1})
.ATTR(pad_begin, AttrValue::INT{0})
.ATTR(pad_end, AttrValue::INT{0})
.OP_END();
/**
* Resize images to size using nearest neighbor interpolation.
* <Input>
* image : Resize images to size using nearest neighbor interpolation.
* size : Must be one dimension and two elements
* <Output>
* output : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels. Defaults to false
*/
REG_OP(ResizeNearestNeighbor)
.INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.INPUT(size, TensorType({DT_INT32}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.ATTR(align_corners, AttrValue::BOOL{false})
.OP_END();
/**
* Pads a tensor.
* <Input>
...@@ -192,14 +81,14 @@ bool BuildModel(std::vector<ge::Operator>& inputs, // NOLINT
std::string UniqueName(const std::string& prefix);
ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
ge::TensorPtr CvtTensor(Tensor* in_tensor,
std::vector<int64_t> out_shape = {},
PrecisionType in_ptype = PRECISION(kFloat),
DataLayoutType in_ltype = DATALAYOUT(kNCHW));
template <typename T>
ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
...@@ -214,7 +103,7 @@ ge::TensorPtr CreateTensorAndFillData(std::vector<T> data,
} else if (info == typeid(int32_t)) {
type = ge::DT_INT32;
} else {
LOG(FATAL) << "[NPU] Unknow value type " << info.name();
}
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
...@@ -245,6 +134,8 @@ ge::TensorPtr CreateTensorAndFillData(T value,
return CreateTensorAndFillData(data, shape, format);
}
int CvtActMode(std::string act_type);
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
......
...@@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() {
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
break; \
} \
......
...@@ -39,7 +39,12 @@ void MatMulJitCode::genCode() { ...@@ -39,7 +39,12 @@ void MatMulJitCode::genCode() {
size_t wgt_offset = 0; size_t wgt_offset = 0;
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len;
}
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
wgt_offset = wgt_offset_tmp;
vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]); vbroadcastss(zmm_t(x_reg_idx), ptr[param_x + x_offset]);
// clean // clean
if (k == 0) { if (k == 0) {
...@@ -48,7 +53,8 @@ void MatMulJitCode::genCode() { ...@@ -48,7 +53,8 @@ void MatMulJitCode::genCode() {
} }
} }
for (int i = 0; i < groups[g]; ++i) { for (int i = 0; i < groups[g]; ++i) {
vmovups(zmm_t(w_reg_idx), ptr[reg_ptr_wgt + wgt_offset]); vmovups(zmm_t(w_reg_idx),
ptr[reg_ptr_wgt + wgt_offset + k * n_ * sizeof(float)]);
vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx)); vfmadd231ps(zmm_t(i), zmm_t(w_reg_idx), zmm_t(x_reg_idx));
wgt_offset += block_len; wgt_offset += block_len;
} }
......
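A note on the offset arithmetic introduced in the hunk above (derived only from the code shown):

// For group g, row k and block i inside the group, the weight load address is now
//   reg_ptr_wgt + (groups[0] + ... + groups[g-1] + i) * block_len
//               + k * n_ * sizeof(float)
// i.e. the per-group column base is re-established for every k instead of
// letting wgt_offset keep accumulating across rows.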
...@@ -50,7 +50,8 @@ math_library(unpooling) ...@@ -50,7 +50,8 @@ math_library(unpooling)
math_library(vol2col) math_library(vol2col)
## math_library(prelu) ## math_library(prelu)
math_library(tree2col DEPS math_function) math_library(tree2col DEPS math_function)
math_library(sequence_topk_avg_pooling)
math_library(search_fc DEPS blas dynload_mklml)
# cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
# cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
# cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/search_fc.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
/*
 * SearchFcFunctor computes top = bottom * w^T (+ b), where bottom is
 * [batch, _in] and w is [_out, _in], as used by the search_fc op.
 */
template <typename T>
class SearchFcFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::X86Context& context,
const lite::Tensor& bottom,
const lite::Tensor& w,
const lite::Tensor& b,
lite::Tensor* top,
int out_size) {
int batch = bottom.dims()[0];
    int _out = w.dims()[0];  // output feature size
    int _in = w.dims()[1];   // input feature size
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas,
CblasNoTrans,
CblasTrans,
batch,
_out,
_in,
1.0f,
bottom_data,
weights,
0.0f,
top_data);
if (true) {
const auto* bias_data = b.data<T>();
for (int i = 0; i < batch; ++i) {
// add bias here
sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out);
}
}
}
// private:
};
#define DEFINE_FUNCTOR(type) \
template class SearchFcFunctor<lite::TargetType::kX86, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
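For reference, the FOR_ALL_TYPES(DEFINE_FUNCTOR) line above expands to a single explicit instantiation, since only float is listed in FOR_ALL_TYPES:

template class SearchFcFunctor<lite::TargetType::kX86, float>;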
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/backends/x86/mklml.h"
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename DeviceContext, typename T>
void call_gemm(const BlasT<lite::TargetType::kX86, T> blas,
const CBLAS_TRANSPOSE TransA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const T* B,
const T beta,
T* C) {
#ifndef __NAIVE_GEMM__
int lda = (TransA == CblasNoTrans) ? K : M;
int ldb = (TransB == CblasNoTrans) ? N : K;
blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N);
#else
naive::gemm((TransA == CblasTrans),
(TransB == CblasTrans),
M,
N,
K,
alpha,
A,
B,
beta,
C);
#endif // !__NAIVE_GEMM__
}
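A worked instantiation of the leading-dimension logic above, matching the search_fc call in the .cc file (A = bottom [batch x _in], B = weights [_out x _in], TransB = CblasTrans):

//   M = batch, N = _out, K = _in
//   lda = K (= _in)   because TransA == CblasNoTrans
//   ldb = K (= _in)   because TransB == CblasTrans
//   ldc = N (= _out)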
// To align with Lego
#ifndef LEGO_USE_FLOAT
#define LEGO_USE_FLOAT
#endif
#ifndef LEGO_SSE
#define LEGO_SSE
#endif
#if defined(LEGO_USE_FLOAT)
#define __m256x __m256
#define __m128x __m128
static const unsigned int AVX_STEP_SIZE = 8;
static const unsigned int SSE_STEP_SIZE = 4;
static const unsigned int AVX_CUT_LEN_MASK = 7U;
static const unsigned int SSE_CUT_LEN_MASK = 3U;
#define _mm256_setzero_px _mm256_setzero_ps
#define _mm256_mul_px _mm256_mul_ps
#define _mm256_add_px _mm256_add_ps
#define _mm256_load_px _mm256_loadu_ps
#define _mm256_hadd_px _mm256_hadd_ps
#define _mm256_permute2f128_px _mm256_permute2f128_ps
#define _mm256_store_px _mm256_storeu_ps
#define _mm256_broadcast_sx _mm256_broadcast_ss
#define _mm256_castpx256_px128 _mm256_castps256_ps128
#define _mm256_max_px _mm256_max_ps
#define _mm256_sub_px _mm256_sub_ps
#define _mm256_set1_px _mm256_set1_ps
#define _mm256_sqrt_px _mm256_sqrt_ps
#define _mm256_div_px _mm256_div_ps
#define _mm_setzero_px _mm_setzero_ps
#define _mm_add_px _mm_add_ps
#define _mm_mul_px _mm_mul_ps
#define _mm_load_px _mm_loadu_ps
#define _mm_hadd_px _mm_hadd_ps
#define _mm_store_sx _mm_store_ss
#define _mm_store_px _mm_storeu_ps
#define _mm_load1_px _mm_load1_ps
#define _mm_max_px _mm_max_ps
#define _mm_sub_px _mm_sub_ps
#define _mm_set1_px _mm_set1_ps
#define _mm_sqrt_px _mm_sqrt_ps
#define _mm_div_px _mm_div_ps
#elif defined(LEGO_USE_DOUBLE)
#define __m256x __m256d
#define __m128x __m128d
static const unsigned int AVX_STEP_SIZE = 4;
static const unsigned int SSE_STEP_SIZE = 2;
static const unsigned int AVX_CUT_LEN_MASK = 3U;
static const unsigned int SSE_CUT_LEN_MASK = 1U;
#define _mm256_setzero_px _mm256_setzero_pd
#define _mm256_mul_px _mm256_mul_pd
#define _mm256_add_px _mm256_add_pd
#define _mm256_load_px _mm256_loadu_pd
#define _mm256_hadd_px _mm256_hadd_pd
#define _mm256_permute2f128_px _mm256_permute2f128_pd
#define _mm256_store_px _mm256_storeu_pd
#define _mm256_broadcast_sx _mm256_broadcast_sd
#define _mm256_castpx256_px128 _mm256_castpd256_pd128
#define _mm256_max_px _mm256_max_pd
#define _mm256_sub_px _mm256_sub_pd
#define _mm256_set1_px _mm256_set1_pd
#define _mm256_sqrt_px _mm256_sqrt_pd
#define _mm256_div_px _mm256_div_pd
#define _mm_setzero_px _mm_setzero_pd
#define _mm_add_px _mm_add_pd
#define _mm_mul_px _mm_mul_pd
#define _mm_load_px _mm_loadu_pd
#define _mm_hadd_px _mm_hadd_pd
#define _mm_store_sx _mm_store_sd
#define _mm_store_px _mm_storeu_pd
#define _mm_load1_px _mm_load1_pd
#define _mm_max_px _mm_max_pd
#define _mm_sub_px _mm_sub_pd
#define _mm_set1_px _mm_set1_pd
#define _mm_sqrt_px _mm_sqrt_pd
#define _mm_div_px _mm_div_pd
#endif
template <typename T>
inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) {
unsigned int jjj, lll;
jjj = lll = 0;
#if defined(LEGO_AVX)
lll = len & ~AVX_CUT_LEN_MASK;
for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
_mm256_store_px(
z + jjj,
_mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj)));
}
#elif defined(LEGO_SSE)
lll = len & ~SSE_CUT_LEN_MASK;
for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) {
_mm_store_px(z + jjj,
_mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj)));
}
#endif
for (; jjj < len; jjj++) {
z[jjj] = x[jjj] + y[jjj];
}
}
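A small self-contained check of sse_eltadd's vector-plus-tail handling; this is a sketch that assumes the header is available as lite/backends/x86/math/search_fc.h inside the Lite build:

#include <cstdio>
#include "lite/backends/x86/math/search_fc.h"
int main() {
  float x[5] = {1, 2, 3, 4, 5};
  float y[5] = {10, 20, 30, 40, 50};
  float z[5] = {0};
  // len = 5: four elements take the SSE path, the fifth the scalar tail.
  paddle::lite::x86::math::sse_eltadd(x, y, z, 5);
  for (int i = 0; i < 5; ++i) printf("%.0f ", z[i]);  // 11 22 33 44 55
  return 0;
}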
template <lite::TargetType Target, typename T>
class SearchFcFunctor {
public:
void operator()(const lite::Context<Target>& context,
const lite::Tensor& X,
const lite::Tensor& W,
const lite::Tensor& b,
lite::Tensor* Out,
int out_size);
};
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
#define FOR_ALL_TYPES(macro) macro(float);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/x86/math/sequence_topk_avg_pooling.h"
#include <algorithm>
#include <vector>
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename T>
void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) {
size_t real_k = k < length ? k : length;
std::vector<T> v(data, data + length);
std::vector<int> topk_pos;
T min_val = -10000000.0;
while (topk_pos.size() < real_k) {
T max_val = min_val;
int max_pos = -1;
for (int i = 0; i < length; ++i) {
if (v[i] > max_val) {
max_pos = i;
max_val = v[i];
}
}
assert(max_pos >= 0);
topk_pos.push_back(max_pos);
v[max_pos] = min_val;
}
assert(topk_pos.size() > 0);
while (topk_pos.size() < (size_t)k) {
topk_pos.push_back(-1);
}
for (size_t i = 0; i < topk_pos.size(); ++i) {
pos[i] = topk_pos[i];
}
}
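A worked example of the selection and padding behavior above (illustrative only):

// data = {0.2f, 0.9f, 0.5f}, length = 3, k = 4
//   real_k = min(k, length) = 3
//   get_topk_pos -> pos = {1, 2, 0, -1}
// Indices are emitted in descending order of value and padded with -1 once the
// real_k largest positions have been taken.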
/*
 * SequenceTopkAvgPoolingFunctor computes, for every channel of every row,
 * the averages of the top-k values (one output per entry in `topks`) and
 * records the selected positions in `pos`.
 */
template <typename T>
class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::Tensor& in,
const lite::Tensor& row,
const lite::Tensor& col,
lite::Tensor* out,
lite::Tensor* pos,
int channel_num,
std::vector<int> topks) {
auto k_num = topks.size();
auto max_k = topks[topks.size() - 1];
std::vector<int64_t> vec_pos_shape;
auto in_lod = in.lod()[0];
auto row_lod = row.lod()[0];
auto col_lod = col.lod()[0];
int batch_size = row_lod.size() - 1;
int pos_total_size = row_lod[batch_size] * channel_num * max_k;
vec_pos_shape.push_back(pos_total_size);
lite::DDim dims(vec_pos_shape);
pos->Resize(dims);
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0;
std::vector<size_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i];
vec_out_lod.push_back(offset);
}
lite::LoD lod_temp;
lod_temp.push_back(vec_out_lod);
out->set_lod(lod_temp);
auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) {
int total_size = in_lod[i + 1] - in_lod[i];
int row_size = row_lod[i + 1] - row_lod[i];
int col_size = col_lod[i + 1] - col_lod[i];
CHECK_EQ(total_size, channel_num * row_size * col_size)
<< "size wrong in sequence_topk_avg_pooling_op!";
int feature_num = row_size * col_size;
for (int j = 0; j < channel_num; ++j) {
auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num;
for (int r = 0; r < row_size; ++r) {
auto row_data = input_offset_feature_data + r * col_size;
auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k +
r * channel_num * max_k + j * max_k;
auto out_slice_data = out_data + row_lod[i] * channel_num * k_num +
r * channel_num * k_num + j * k_num;
get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
if (pos_slice_data[0] == -1) {
sum_data[0] = 0.0;
} else {
sum_data[0] = row_data[pos_slice_data[0]];
}
for (int k = 1; k < max_k; ++k) {
if (pos_slice_data[k] == -1) {
sum_data[k] = sum_data[k - 1];
} else {
sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]];
}
}
for (size_t k = 0; k < k_num; ++k) {
out_slice_data[k] = sum_data[topks[k] - 1] / topks[k];
}
}
}
}
delete[] sum_data;
}
};
#define DEFINE_FUNCTOR(type) \
template class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, type>;
FOR_ALL_TYPES(DEFINE_FUNCTOR);
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
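A worked example of the per-row arithmetic in the functor above (illustrative only):

// One channel, one row: row_data = {3, 1, 2}, topks = {1, 3}, so max_k = 3.
//   get_topk_pos -> pos      = {0, 2, 1}
//   prefix sums  -> sum_data = {3, 3 + 2, 3 + 2 + 1} = {3, 5, 6}
//   outputs      -> out      = {sum_data[0] / 1, sum_data[2] / 3} = {3.0, 2.0}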
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "lite/core/context.h"
#include "lite/core/tensor.h"
#include "lite/fluid/data_type.h"
namespace paddle {
namespace lite {
namespace x86 {
namespace math {
template <typename T>
void get_topk_pos(
const T* data, int length, int k, int* pos, bool debug = false);
template <lite::TargetType Target, typename T>
class SequenceTopkAvgPoolingFunctor {
public:
void operator()(const lite::Tensor& X,
const lite::Tensor& ROW,
const lite::Tensor& COLUMN,
lite::Tensor* Out,
lite::Tensor* pos,
int channel_num,
std::vector<int> topks);
};
} // namespace math
} // namespace x86
} // namespace lite
} // namespace paddle
#define FOR_ALL_TYPES(macro) macro(float);
...@@ -207,6 +207,13 @@ class Context<TargetType::kCUDA> { ...@@ -207,6 +207,13 @@ class Context<TargetType::kCUDA> {
ctx->cublas_fp32_ = cublas_fp32_; ctx->cublas_fp32_ = cublas_fp32_;
} }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
this->cublas_fp32_ = context.cublas_fp32_;
return *this;
}
const cudaStream_t& exec_stream() const { return exec_stream_; } const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
...@@ -232,6 +239,13 @@ class Context<TargetType::kCUDA> { ...@@ -232,6 +239,13 @@ class Context<TargetType::kCUDA> {
std::string name() const { return "CUDAContext"; } std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private: private:
int device_id_; int device_id_;
// overall information // overall information
......
...@@ -59,6 +59,12 @@ namespace paddle { ...@@ -59,6 +59,12 @@ namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_ARM #ifdef LITE_WITH_ARM
thread_local lite_api::PowerMode DeviceInfo::mode_;
thread_local ARMArch DeviceInfo::arch_;
thread_local int DeviceInfo::mem_size_;
thread_local std::vector<int> DeviceInfo::active_ids_;
thread_local TensorLite DeviceInfo::workspace_;
thread_local int64_t DeviceInfo::count_ = 0;
#ifdef TARGET_IOS #ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
...@@ -1033,7 +1039,7 @@ int DeviceInfo::Setup() { ...@@ -1033,7 +1039,7 @@ int DeviceInfo::Setup() {
<< ", max freq: " << max_freqs_[i] << ", max freq: " << max_freqs_[i]
<< ", min freq: " << min_freqs_[i] << ", min freq: " << min_freqs_[i]
<< ", cluster ID: " << cluster_ids_[core_ids_[i]] << ", cluster ID: " << cluster_ids_[core_ids_[i]]
<< ", CPU ARCH: A" << archs_[i]; << ", CPU ARCH: A" << static_cast<int>(archs_[i]);
} }
LOG(INFO) << "L1 DataCache size is: "; LOG(INFO) << "L1 DataCache size is: ";
for (int i = 0; i < core_num_; ++i) { for (int i = 0; i < core_num_; ++i) {
...@@ -1087,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { ...@@ -1087,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
RequestPowerRandLowMode(shift_num, thread_num); RequestPowerRandLowMode(shift_num, thread_num);
break; break;
default: default:
LOG(FATAL) << "Unsupported power mode: " << mode; LOG(FATAL) << "Unsupported power mode: " << static_cast<int>(mode);
break; break;
} }
if (active_ids_.empty()) { if (active_ids_.empty()) {
......
...@@ -79,7 +79,6 @@ class DeviceInfo { ...@@ -79,7 +79,6 @@ class DeviceInfo {
int core_num_; int core_num_;
std::vector<int> max_freqs_; std::vector<int> max_freqs_;
std::vector<int> min_freqs_; std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_; std::string dev_name_;
std::vector<int> L1_cache_; std::vector<int> L1_cache_;
...@@ -94,14 +93,15 @@ class DeviceInfo { ...@@ -94,14 +93,15 @@ class DeviceInfo {
std::vector<bool> fp16_; std::vector<bool> fp16_;
std::vector<bool> dot_; std::vector<bool> dot_;
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores, // LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core, // LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores // LITE_POWER_FULL stands for using all cores
lite_api::PowerMode mode_; static thread_local lite_api::PowerMode mode_;
std::vector<int> active_ids_; static thread_local ARMArch arch_;
TensorLite workspace_; static thread_local int mem_size_;
int64_t count_{0}; static thread_local std::vector<int> active_ids_;
static thread_local TensorLite workspace_;
static thread_local int64_t count_;
void SetDotInfo(int argc, ...); void SetDotInfo(int argc, ...);
void SetFP16Info(int argc, ...); void SetFP16Info(int argc, ...);
...@@ -119,7 +119,6 @@ class DeviceInfo { ...@@ -119,7 +119,6 @@ class DeviceInfo {
DeviceInfo() = default; DeviceInfo() = default;
}; };
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
template <TargetType Type> template <TargetType Type>
......
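A minimal sketch of what the thread_local change above enables; DeviceInfo::Global() and the exact PowerMode spelling are assumed from the surrounding codebase rather than shown in this hunk:

#include <thread>
// Each thread now gets its own mode_/arch_/active_ids_/workspace_, so two
// threads can bind different run modes without racing (sketch only):
void worker(paddle::lite_api::PowerMode mode, int num_threads) {
  auto& dev = paddle::lite::DeviceInfo::Global();  // assumed singleton accessor
  dev.SetRunMode(mode, num_threads);               // per-thread from this patch on
  // ... run inference on this thread ...
}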
...@@ -34,13 +34,16 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -34,13 +34,16 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
// fuse quantized node and dequant node // fuse quantized node and dequant node
std::vector<std::string> quantized_op_types = { for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) {
"conv2d", "mul", "depthwise_conv2d"};
for (auto& op_type : quantized_op_types) {
fusion::DequantOpFuser fuser(op_type); fusion::DequantOpFuser fuser(op_type);
fuser(graph.get()); fuser(graph.get());
} }
for (auto& op_type : {"conv2d", "depthwise_conv2d"}) {
fusion::ChannelWiseDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// delete quant_dequant_node // delete quant_dequant_node
for (auto op_type : {"pool2d", "elementwise_add"}) { for (auto op_type : {"pool2d", "elementwise_add"}) {
fusion::DeleteQuantDequantOpFuser fuser(op_type); fusion::DeleteQuantDequantOpFuser fuser(op_type);
......
...@@ -79,23 +79,26 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { ...@@ -79,23 +79,26 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) {
void DequantOpFuser::BuildPattern() { void DequantOpFuser::BuildPattern() {
std::string weight_name = ""; std::string weight_name = "";
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { if (quantized_op_type_ == "conv2d" ||
quantized_op_type_ == "depthwise_conv2d") {
weight_name = "Filter"; weight_name = "Filter";
} else { } else {
weight_name = "Y"; weight_name = "Y";
} }
auto* quantized_op_input = auto* quantized_op_input = VarNode("quantized_op_input")
VarNode("quantized_op_input")->assert_is_op_input(op_type_)->AsInput(); ->assert_is_op_input(quantized_op_type_)
auto* quantized_op_weight = VarNode("quantized_op_weight") ->AsInput();
->assert_is_op_input(op_type_, weight_name) auto* quantized_op_weight =
->AsInput(); VarNode("quantized_op_weight")
auto* quantized_op = OpNode("quantized_op", op_type_) ->assert_is_op_input(quantized_op_type_, weight_name)
->assert_is_op(op_type_) ->AsInput();
auto* quantized_op = OpNode("quantized_op", quantized_op_type_)
->assert_is_op(quantized_op_type_)
->AsIntermediate(); ->AsIntermediate();
auto* quantized_op_out = auto* quantized_op_out =
VarNode("quantized_op_out") VarNode("quantized_op_out")
->assert_is_op_output(op_type_) ->assert_is_op_output(quantized_op_type_)
->assert_is_op_input("fake_dequantize_max_abs", "X") ->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate(); ->AsIntermediate();
auto* dequant_op = OpNode("dequant_op", "fake_dequantize_max_abs") auto* dequant_op = OpNode("dequant_op", "fake_dequantize_max_abs")
...@@ -110,12 +113,13 @@ void DequantOpFuser::BuildPattern() { ...@@ -110,12 +113,13 @@ void DequantOpFuser::BuildPattern() {
quantized_op_out->LinksFrom({quantized_op}); quantized_op_out->LinksFrom({quantized_op});
dequant_op->LinksFrom({quantized_op_out}); dequant_op->LinksFrom({quantized_op_out});
dequant_op_out->LinksFrom({dequant_op}); dequant_op_out->LinksFrom({dequant_op});
VLOG(4) << "DeQuantOpFuser BuildPattern op_type:" << op_type_;
VLOG(4) << "DeQuantOpFuser BuildPattern op_type:" << quantized_op_type_;
} }
void DequantOpFuser::InsertNewNode(SSAGraph* graph, void DequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) { const key2nodes_t& matched) {
auto* quant_op_input = matched.at("quantized_op_input"); auto* quantized_op_input = matched.at("quantized_op_input");
auto* quantized_op_weight = matched.at("quantized_op_weight"); auto* quantized_op_weight = matched.at("quantized_op_weight");
auto* quantized_op = matched.at("quantized_op"); auto* quantized_op = matched.at("quantized_op");
auto* dequant_op = matched.at("dequant_op"); auto* dequant_op = matched.at("dequant_op");
...@@ -142,14 +146,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -142,14 +146,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>(); scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
std::vector<float> weight_scale; std::vector<float> weight_scale;
int weight_scale_size; int weight_scale_size;
if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { if (quantized_op_type_ == "conv2d" ||
op_desc.SetInput("Input", {quant_op_input->arg()->name}); quantized_op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout. // be Cout.
weight_scale_size = quantized_weight_t->dims()[0]; weight_scale_size = quantized_weight_t->dims()[0];
} else if (op_type_ == "mul") { } else if (quantized_op_type_ == "mul") {
op_desc.SetInput("X", {quant_op_input->arg()->name}); op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout. // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
weight_scale_size = quantized_weight_t->dims()[1]; weight_scale_size = quantized_weight_t->dims()[1];
...@@ -174,11 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -174,11 +179,11 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
quantized_weight_t->set_precision(PRECISION(kInt8)); quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes // new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(op_type_); auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
new_quantized_op->Attach(op_desc, scope); new_quantized_op->Attach(op_desc, scope);
auto* new_quantized_op_node = auto* new_quantized_op_node =
graph->GraphCreateInstructNode(new_quantized_op, valid_places); graph->GraphCreateInstructNode(new_quantized_op, valid_places);
IR_NODE_LINK_TO(quant_op_input, new_quantized_op_node); IR_NODE_LINK_TO(quantized_op_input, new_quantized_op_node);
IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node); IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node);
IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out); IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out);
} }
...@@ -188,6 +193,107 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { ...@@ -188,6 +193,107 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
return op_desc; return op_desc;
} }
void ChannelWiseDequantOpFuser::BuildPattern() {
std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs";
auto* quantized_op_input = VarNode("quantized_op_input")
->assert_is_op_input(quantized_op_type_)
->AsInput();
auto* quantized_op_weight =
VarNode("quantized_op_weight")
->assert_is_op_input(quantized_op_type_, "Filter")
->AsInput();
auto* quantized_op = OpNode("quantized_op", quantized_op_type_)
->assert_is_op(quantized_op_type_)
->AsIntermediate();
auto* quantized_op_out = VarNode("quantized_op_out")
->assert_is_op_output(quantized_op_type_)
->assert_is_op_input(dequant_op_type, "X")
->AsIntermediate();
auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale")
->assert_is_op_input(dequant_op_type)
->AsIntermediate();
auto* dequant_op = OpNode("dequant_op", dequant_op_type)
->assert_is_op(dequant_op_type)
->AsIntermediate();
auto* dequant_op_out = VarNode("dequant_op_out")
->assert_is_op_output(dequant_op_type, "Out")
->AsOutput();
quantized_op->LinksFrom({quantized_op_input, quantized_op_weight});
quantized_op_out->LinksFrom({quantized_op});
dequant_op->LinksFrom({quantized_op_out, dequant_op_channel_scale});
dequant_op_out->LinksFrom({dequant_op});
VLOG(4) << "ChannelWiseDequantOpFuser BuildPattern op_type:"
<< quantized_op_type_;
}
void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* quantized_op_input = matched.at("quantized_op_input");
auto* quantized_op_weight = matched.at("quantized_op_weight");
auto* quantized_op = matched.at("quantized_op");
auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale");
auto* dequant_op = matched.at("dequant_op");
auto* dequant_op_out = matched.at("dequant_op_out");
// obtain input_scale and weight_scale
auto* scope = quantized_op->stmt()->op()->scope();
auto& valid_places = quantized_op->stmt()->op()->valid_places();
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
std::vector<float> weight_scale;
std::vector<int> quant_bits =
dequant_op->stmt()->op_info()->GetAttr<std::vector<int>>("quant_bits");
int weight_bit_length = quant_bits[0];
int range = ((1 << (weight_bit_length - 1)) - 1);
auto channel_scale_name = dequant_op_channel_scale->arg()->name;
auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range);
}
// set op desc
cpp::OpDesc op_desc = *quantized_op->stmt()->op_info();
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
auto quantized_weight_var_name = quantized_op_weight->arg()->name;
auto quantized_weight_t =
scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
Tensor temp_tensor;
temp_tensor.CopyDataFrom(*quantized_weight_t);
float* temp_data = temp_tensor.mutable_data<float>();
int8_t* quantized_weight_data = quantized_weight_t->mutable_data<int8_t>();
for (size_t i = 0; i < quantized_weight_t->data_size(); i++) {
quantized_weight_data[i] = static_cast<int8_t>(temp_data[i]);
}
quantized_weight_t->set_persistable(true);
quantized_weight_t->set_precision(PRECISION(kInt8));
// new op and relink nodes
auto new_quantized_op = LiteOpRegistry::Global().Create(quantized_op_type_);
new_quantized_op->Attach(op_desc, scope);
auto* new_quantized_op_node =
graph->GraphCreateInstructNode(new_quantized_op, valid_places);
IR_NODE_LINK_TO(quantized_op_input, new_quantized_op_node);
IR_NODE_LINK_TO(quantized_op_weight, new_quantized_op_node);
IR_NODE_LINK_TO(new_quantized_op_node, dequant_op_out);
}
cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
return op_desc;
}
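A quick numeric check of the per-channel scale conversion in InsertNewNode above:

// quant_bits = {8}  ->  range = (1 << 7) - 1 = 127
// stored channel scale 127.0  ->  weight_scale[c] = 127.0 / 127 = 1.0
// stored channel scale  63.5  ->  weight_scale[c] = 0.5
// so the int8 weight recovers w_float ≈ w_int8 * weight_scale[c] at runtime.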
void DeleteQuantDequantOpFuser::BuildPattern() { void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type = std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max"; "fake_quantize_dequantize_moving_average_abs_max";
......
...@@ -24,18 +24,21 @@ namespace mir { ...@@ -24,18 +24,21 @@ namespace mir {
namespace fusion { namespace fusion {
/* The model trained by fluid quantization is a simulation of real int8. /* The model trained by fluid quantization is a simulation of real int8.
* The quantized Ops(conv2d, mul, depthwise conv2d etc) have fake_quantop * The quantized Ops(conv2d, mul, depthwise conv2d etc) have fake_quant op
* in front and fake_dequantop behind. * in front and fake_dequant op behind.
* *
* When in int8 mode, the pattern like "fake_quant + quantized_op + * For int8 mode, the pattern like "fake_quant + quantized_op + fake_dequant"
* fake_dequant" * can be processed by the following three fuser. The fuser extract the
* can be detected by this fuser. The fuser extract the input_scale and * input_scale and the weight_scale info from fake_quant, fake_dequant op and
* the weight_scale info from fake_quant, fake_dequant op and fuse those into * fuse those into the quantized_op.
* the quantized_op.
 * In addition, the fusers delete the fake_quant and fake_dequant ops from the * In addition, the fusers delete the fake_quant and fake_dequant ops from the
 * graph at the end. * graph at the end.
*/ */
/* DeleteQuantOpFuser processes
* fake_quantize_range_abs_max/fake_quantize_moving_average_abs_max
* + conv2d/mul/depthwise.
*/
class DeleteQuantOpFuser : public FuseBase { class DeleteQuantOpFuser : public FuseBase {
public: public:
explicit DeleteQuantOpFuser(const std::string& quant_op_type) explicit DeleteQuantOpFuser(const std::string& quant_op_type)
...@@ -50,9 +53,12 @@ class DeleteQuantOpFuser : public FuseBase { ...@@ -50,9 +53,12 @@ class DeleteQuantOpFuser : public FuseBase {
std::string quant_op_type_{}; std::string quant_op_type_{};
}; };
/* DequantOpFuser processes conv2d/depthwise_conv2d/mul + fake_dequantize_max_abs.
*/
class DequantOpFuser : public FuseBase { class DequantOpFuser : public FuseBase {
public: public:
explicit DequantOpFuser(const std::string& op_type) : op_type_(op_type) {} explicit DequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override; void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
...@@ -60,7 +66,24 @@ class DequantOpFuser : public FuseBase { ...@@ -60,7 +66,24 @@ class DequantOpFuser : public FuseBase {
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private: private:
std::string op_type_{}; std::string quantized_op_type_{};
};
/* ChannelWiseDequantOpFuser processes conv2d/depthwise_conv2d +
* fake_channel_wise_dequantize_max_abs.
*/
class ChannelWiseDequantOpFuser : public FuseBase {
public:
explicit ChannelWiseDequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quantized_op_type_{};
}; };
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + /* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
......
...@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -70,6 +70,7 @@ class StaticKernelPickPass : public mir::StmtPass {
const auto& place = places[i]; const auto& place = places[i];
float weight = static_cast<float>(place_size - i) / place_size; float weight = static_cast<float>(place_size - i) / place_size;
size_t score{}; size_t score{};
// The more important factor comes first // The more important factor comes first
if (kernel_pick_factors_.IsTargetConsidered() && if (kernel_pick_factors_.IsTargetConsidered() &&
(place.target == kernel.target() || kernel.target() == TARGET(kAny) || (place.target == kernel.target() || kernel.target() == TARGET(kAny) ||
...@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass { ...@@ -102,17 +103,17 @@ class StaticKernelPickPass : public mir::StmtPass {
VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "[score(final)]:" << final_score;
VLOG(4) << "-------- pick summary --------"; VLOG(4) << "-------- pick summary --------";
VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision)
<< DataLayoutToStr(winner_place.layout) << " " << " " << DataLayoutToStr(winner_place.layout) << " "
<< TargetToStr(winner_place.target); << TargetToStr(winner_place.target);
VLOG(4) << " ===> kernel.place():" VLOG(4) << " ===> kernel.place():"
<< PrecisionToStr(kernel.place().precision) << " " << PrecisionToStr(kernel.place().precision) << " "
<< DataLayoutToStr(kernel.place().layout) << " " << DataLayoutToStr(kernel.place().layout) << " "
<< TargetToStr(kernel.place().target); << TargetToStr(kernel.place().target);
VLOG(4) << "kernel.op_type():" << kernel.op_type(); VLOG(4) << "kernel.op_type():" << kernel.op_type();
VLOG(4) << "picker tactic " << kernel_pick_factors_; VLOG(4) << "kernel picker factors:" << kernel_pick_factors_;
VLOG(4) << "kernel place " << kernel.place().DebugString(); VLOG(4) << "kernel place:" << kernel.place().DebugString();
VLOG(4) << "picker place " << winner_place.DebugString(); VLOG(4) << "winner_picker place:" << winner_place.DebugString();
VLOG(4) << "------------------------------"; VLOG(4) << "------------------------------";
// The data layout is not considered, for the input and output arguments // The data layout is not considered, for the input and output arguments
......
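A quick numeric illustration of the place weighting used in the scoring above:

// weight = (place_size - i) / place_size, where i is the index in valid_places.
// With three valid places: i = 0 -> 1.00, i = 1 -> 0.67, i = 2 -> 0.33,
// so a kernel matching an earlier (more preferred) place scores higher; this is
// why the demos insert the preferred Place at the front of valid_places.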
...@@ -35,7 +35,7 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode( ...@@ -35,7 +35,7 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
lite::mir::Node* var_node, const Scope* scope) { lite::mir::Node* var_node, const Scope* scope) {
CHECK(var_node->IsArg()); CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg(); const auto& arg = var_node->AsArg();
VLOG(4) << "Convert var node " << arg.name; VLOG(4) << "[NPU] Convert var node " << arg.name;
auto* var = scope->FindVar(arg.name); auto* var = scope->FindVar(arg.name);
CHECK(var); CHECK(var);
...@@ -44,13 +44,13 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode( ...@@ -44,13 +44,13 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
auto dims = tensor->dims(); auto dims = tensor->dims();
if (arg.is_weight) { if (arg.is_weight) {
auto wgt = std::make_shared<ge::op::Const>(arg.name); auto wgt = std::make_shared<ge::op::Const>(arg.name);
LOG(INFO) << "in convert const:" << arg.name; LOG(INFO) << "[NPU] Convert const var node " << arg.name;
VLOG(4) << dims; VLOG(4) << dims;
wgt->set_attr_value(lite::npu::CvtFromLiteTensor(tensor)); wgt->set_attr_value(lite::npu::CvtTensor(tensor));
return wgt; return wgt;
} else { } else {
CHECK_EQ(dims.size(), 4); CHECK_EQ(dims.size(), 4);
LOG(INFO) << "in convert data:" << arg.name; LOG(INFO) << "[NPU] Convert data var node " << arg.name;
LOG(INFO) << dims; LOG(INFO) << dims;
// TODO(xxx): support more types and dims size // TODO(xxx): support more types and dims size
ge::TensorDesc desc(ge::Shape(dims.Vectorize()), ge::TensorDesc desc(ge::Shape(dims.Vectorize()),
...@@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( ...@@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph(
// persistable=true, so that the model parser can recognize it and save it to // persistable=true, so that the model parser can recognize it and save it to
// param files // param files
if (!lite::npu::BuildModel(inputs, outputs, weight)) { if (!lite::npu::BuildModel(inputs, outputs, weight)) {
LOG(WARNING) << "Build NPU failed subgraph " << sub_id; LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")";
throw std::runtime_error("Build NPU failed subgraph."); throw std::runtime_error("Build NPU graph failed.");
} }
LOG(INFO) << "[NPU] Build NPU Client success subgraph " << sub_id; LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")";
return weight_var_name; return weight_var_name;
} }
...@@ -166,12 +166,12 @@ void GenerateNPUProgramPass::GenNPUSubgraph( ...@@ -166,12 +166,12 @@ void GenerateNPUProgramPass::GenNPUSubgraph(
} }
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get()); LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions(); const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types; std::vector<std::string> supported_op_types;
for (auto& i : op_map) { for (auto& i : op_map) {
LOG(INFO) << "Supported type: " << i.first; LOG(INFO) << "[NPU] Supported type: " << i.first;
supported_op_types.push_back(i.first); supported_op_types.push_back(i.first);
} }
...@@ -182,15 +182,15 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -182,15 +182,15 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
CHECK_EQ(op_nodes_all.size(), num_subgraph); CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1; int id = 1;
for (auto& op_nodes : op_nodes_all) { for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "Converting subgraph_id:" << id; LOG(INFO) << "[NPU] Converting Subgraph " << id;
GenNPUSubgraph(graph, op_nodes.second, id); GenNPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "After NPU Pass Subgraph " << id << "\n" LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get()); << Visualize(graph.get());
id++; id++;
} }
} catch (...) { } catch (...) {
LOG(WARNING) << "Build NPU graph failed"; LOG(WARNING) << "[NPU] Build NPU graph failed.";
throw std::runtime_error("Build NPU graph failed"); throw std::runtime_error("[NPU] Build NPU graph failed.");
} }
for (auto& item : graph->StmtTopologicalOrder()) { for (auto& item : graph->StmtTopologicalOrder()) {
...@@ -203,7 +203,7 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -203,7 +203,7 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} }
std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() { std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() {
LOG(INFO) << "insts.size " << insts_.size(); LOG(INFO) << "[NPU] program insts.size " << insts_.size();
std::unique_ptr<RuntimeProgram> program( std::unique_ptr<RuntimeProgram> program(
new RuntimeProgram(std::move(insts_))); new RuntimeProgram(std::move(insts_)));
return program; return program;
......
...@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( ...@@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst(
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
#ifdef LITE_WITH_OPENCL
// layout kernel choose // layout kernel choose
// must ignore [layout check] for layout of kernels's input and output // must ignore [layout check] for layout of kernels's input and output
if (TargetCompatibleTo(*in_arg_ty, from) && // note: replace LITE_WITH_OPENCL macro with judge input and output target
PrecisionCompatibleTo(*in_arg_ty, from) && // of layout_trans
DeviceCompatibleTo(*in_arg_ty, from) && if ((in_arg_ty->target() == TARGET(kOpenCL) ||
out_arg_ty->layout() == to.layout()) { out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
#else (TargetCompatibleTo(*in_arg_ty, from) &&
if (TypeCompatible(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) { DeviceCompatibleTo(*in_arg_ty, from) &&
#endif out_arg_ty->layout() == to.layout())) {
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->layout() == to.layout()) {
is_found = true; is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op);
break; break;
} }
} }
CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":"
<< in->AsArg().name << "->" << to << ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
......
...@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst(
VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty;
VLOG(4) << "to:" << to << "\n"; VLOG(4) << "to:" << to << "\n";
// kernel choose branch for opencl backend // kernel choose branch for opencl backend
// judge inst's target whether is kOpenCL // judge inst's target whether is kOpenCL
// Note: to == *decl_arg_type == in of inst, not output of last inst // Note: to == *decl_arg_type == in of inst, not output of last inst
#ifdef LITE_WITH_OPENCL
// ignore [layout check] for layout between [to] and [from] // ignore [layout check] for layout between [to] and [from]
// Because all of origin opencl insts in model, are not default layout // Because all of origin opencl insts in model, are not default layout
// NCHW, // NCHW,
...@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( ...@@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst(
// [*decl_arg_type] -> [to]: input of inst, not output of last // [*decl_arg_type] -> [to]: input of inst, not output of last
// [in_arg_ty]: in of io_copy // [in_arg_ty]: in of io_copy
// [out_arg_ty]: out of io_copy // [out_arg_ty]: out of io_copy
if (TargetCompatibleTo(*in_arg_ty, from) && //
PrecisionCompatibleTo(*in_arg_ty, from) && // note: replace LITE_WITH_OPENCL macro with judge input and output
DeviceCompatibleTo(*in_arg_ty, from) && // of io_copy
TargetCompatibleTo(*out_arg_ty, to)) { if ((in_arg_ty->target() == TARGET(kOpenCL) ||
VLOG(4) << "do nothing. opencl found"; out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first
#else (TargetCompatibleTo(*in_arg_ty, from) &&
if (TypeCompatible(*in_arg_ty, from) && PrecisionCompatibleTo(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) { DeviceCompatibleTo(*in_arg_ty, from) &&
#endif TargetCompatibleTo(*out_arg_ty, to))) {
VLOG(4) << "picked, opencl found";
is_found = true;
} else if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->target() == to.target()) {
VLOG(4) << "picked"; VLOG(4) << "picked";
is_found = true; is_found = true;
}
if (is_found) {
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
io_copy_inst->AsStmt( io_copy_inst->AsStmt(
io_copy_type, std::move(selected_kernels), io_copy_op); io_copy_type, std::move(selected_kernels), io_copy_op);
break; break;
} }
VLOG(4) << "not picked"; VLOG(4) << "not picked";
} }
CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from
<< ":" << in->AsArg().name << " -> " << to << ":" << ":" << in->AsArg().name << " -> " << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
......
...@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass {
} }
} }
// Set the tye of the weight // Set the type of the weight
void SetWeightType(Node* w, const LiteType& type) { void SetWeightType(Node* w,
// TODO(xg) to optimize this const LiteType& type,
#ifdef LITE_WITH_FPGA const std::map<std::string, bool>& lite_with_targets) {
w->AsArg().type = LiteType::GetTensorTy( VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); if (lite_with_targets.at("kFPGA")) {
#endif w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#ifdef LITE_WITH_OPENCL } else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy( w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
#endif } else {
w->AsArg().type = LiteType::GetTensorTy(
#ifndef LITE_WITH_FPGA TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
#ifndef LITE_WITH_OPENCL }
w->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
#endif
#endif
} }
void InferenceArgumentPlace(SSAGraph* graph) { void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) { for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt(); auto& inst = x->AsStmt();
// The IoCopyOp is a tool operator, it won't support the type inference. // The IoCopyOp is a tool operator, it won't support the type inference.
// in fpga, we have io_copy+cali+layout tool ops, so we need type inference for // in fpga, we have io_copy+cali+layout tool ops, so we need type inference
// tool operator // for
#ifndef LITE_WITH_FPGA // tool operator
#ifndef LITE_WITH_OPENCL if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue"; VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue; if (inst.op_type() == "io_copy") continue;
#endif }
#endif
// deal with inputs // deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr(); VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments. // TODO(zhaolong): Add check if the node's name in op's arguments.
...@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_in->AsArg().type) { if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) { if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type); SetWeightType(x_in, *type, lite_with_targets);
} else { } else {
x_in->AsArg().type = type; x_in->AsArg().type = type;
} }
...@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { ...@@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass {
if (!x_out->AsArg().type) { if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) { if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type); SetWeightType(x_out, *type, lite_with_targets);
} else { } else {
x_out->AsArg().type = type; x_out->AsArg().type = type;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
...@@ -49,6 +50,22 @@ class Optimizer { ...@@ -49,6 +50,22 @@ class Optimizer {
valid_places_ = valid_places; valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found"; CHECK(!graph_) << "duplicate optimize found";
auto valid_places_has_target = [&](TargetType t) -> bool {
for (auto& p : valid_places) {
if (p.target == t) {
return true;
}
}
return false;
};
std::map<std::string, bool> lite_with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kNPU", valid_places_has_target(TARGET(kNPU))},
{"kXPU", valid_places_has_target(TARGET(kXPU))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kNPU']:" << lite_with_targets["kNPU"];
VLOG(4) << "lite_with_targets['kXPU']:" << lite_with_targets["kXPU"];
graph_.reset(new mir::SSAGraph); graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places); graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places); graph_->SetValidPlaces(valid_places);
...@@ -57,14 +74,11 @@ class Optimizer { ...@@ -57,14 +74,11 @@ class Optimizer {
InitTargetTypeTransformPass(); InitTargetTypeTransformPass();
if (passes.empty()) { if (passes.empty()) {
RunPasses(std::vector<std::string>{ std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass", // {"lite_quant_dequant_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
"lite_conv_bn_fuse_pass", // "lite_conv_bn_fuse_pass", //
"lite_conv_elementwise_fuse_pass", // conv-bn-elemwise "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
// This pass is disabled to force some opencl kernels selected for
// final running, otherwise, they will be fused to ARM fusion
// kernels, and the OpenCL devices will be discarded.
// TODO(Superjomn) Refine the fusion related design to select fusion // TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically. // kernels for devices automatically.
"lite_conv_activation_fuse_pass", // "lite_conv_activation_fuse_pass", //
...@@ -105,16 +119,17 @@ class Optimizer { ...@@ -105,16 +119,17 @@ class Optimizer {
"argument_type_display_pass", // "argument_type_display_pass", //
"variable_place_inference_pass", // "variable_place_inference_pass", //
"argument_type_display_pass", // "argument_type_display_pass",
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", // "argument_type_display_pass"}};
#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ if ((!lite_with_targets["kOpenCL"]) && (!lite_with_targets["kNPU"]) &&
!defined(LITE_WITH_XPU) (!lite_with_targets["kXPU"])) {
// TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in OpenCL
"memory_optimize_pass", // kernel
#endif passes_local.emplace_back("memory_optimize_pass");
"argument_type_display_pass"}}); }
RunPasses(passes_local);
} else { } else {
RunPasses(passes); RunPasses(passes);
} }
...@@ -141,6 +156,7 @@ class Optimizer { ...@@ -141,6 +156,7 @@ class Optimizer {
.LookUp<mir::subgraph::GenerateNPUProgramPass>( .LookUp<mir::subgraph::GenerateNPUProgramPass>(
"generate_npu_program_pass"); "generate_npu_program_pass");
#endif #endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
auto pass = mir::PassManager::Global() auto pass = mir::PassManager::Global()
.LookUp<mir::subgraph::GenerateXPUProgramPass>( .LookUp<mir::subgraph::GenerateXPUProgramPass>(
......
...@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) { ...@@ -32,11 +32,21 @@ int64_t ShapeProduction(const shape_t& shape) {
return res; return res;
} }
// 0. Enable OpenCL, if needed
// Enable the `DEMO_WITH_OPENCL` macro below if you need to run on the GPU (OpenCL).
// #define DEMO_WITH_OPENCL
void RunModel() { void RunModel() {
// 1. Set CxxConfig // 1. Set CxxConfig
CxxConfig config; CxxConfig config;
config.set_model_dir(FLAGS_model_dir); config.set_model_dir(FLAGS_model_dir);
#ifdef DEMO_WITH_OPENCL
std::vector<Place> valid_places{
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
Place{TARGET(kARM), PRECISION(kFloat)}};
#else
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
#endif
if (FLAGS_prefer_int8_kernel) { if (FLAGS_prefer_int8_kernel) {
valid_places.insert(valid_places.begin(), valid_places.insert(valid_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)}); Place{TARGET(kARM), PRECISION(kInt8)});
......
...@@ -100,5 +100,9 @@ lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_ ...@@ -100,5 +100,9 @@ lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
if(LITE_BUILD_EXTRA)
lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm)
endif()
...@@ -49,10 +49,7 @@ void CastCompute::Run() { ...@@ -49,10 +49,7 @@ void CastCompute::Run() {
const int32_t* x_data_begin = param.X->data<int32_t>(); const int32_t* x_data_begin = param.X->data<int32_t>();
const int32_t* x_data_end = x_data_begin + param.X->numel(); const int32_t* x_data_end = x_data_begin + param.X->numel();
float* out_data = param.Out->mutable_data<float>(); float* out_data = param.Out->mutable_data<float>();
// std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, std::transform(x_data_begin, x_data_end, out_data, TransOp<int32_t, float>);
// float>);
// todo: the input type actually is float.
memcpy(out_data, x_data_begin, sizeof(float) * param.X->numel());
} else if (param.in_dtype == 20 && param.out_dtype == 5) { // uint8->float32 } else if (param.in_dtype == 20 && param.out_dtype == 5) { // uint8->float32
const unsigned char* x_data_begin = param.X->data<unsigned char>(); const unsigned char* x_data_begin = param.X->data<unsigned char>();
const unsigned char* x_data_end = x_data_begin + param.X->numel(); const unsigned char* x_data_end = x_data_begin + param.X->numel();
......
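The numeric dtype codes compared in CastCompute::Run() follow fluid's VarType numbering; the sketch below spells out the values this kernel appears to rely on (the exact constants are an assumption inferred from the in-code comments, not the library's authoritative definition).

// Hypothetical reference table, mirroring the codes used in CastCompute::Run():
enum class AssumedFluidDType : int32_t {
  kInt32 = 2,   // "int32"
  kFP32 = 5,    // "float32"
  kUInt8 = 20,  // "uint8"
};
// e.g. `in_dtype == 20 && out_dtype == 5` reads as "cast uint8 -> float32".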
...@@ -39,6 +39,11 @@ void ConcatCompute::Run() { ...@@ -39,6 +39,11 @@ void ConcatCompute::Run() {
std::vector<lite::Tensor*> inputs = param.x; std::vector<lite::Tensor*> inputs = param.x;
auto* out = param.output; auto* out = param.output;
int axis = param.axis; int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
out->mutable_data<float>(); out->mutable_data<float>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis. /// Sometimes direct copies will be faster, this maybe need deeply analysis.
...@@ -83,5 +88,7 @@ void ConcatCompute::Run() { ...@@ -83,5 +88,7 @@ void ConcatCompute::Run() {
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def) concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -20,15 +20,10 @@ namespace kernels { ...@@ -20,15 +20,10 @@ namespace kernels {
namespace arm { namespace arm {
template <> template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->template Param<param_t>(); auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto w_dims = param.filter->dims();
auto o_dims = param.output->dims();
if (last_shape_ == x_dims) {
return;
}
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
// extend workspace
if (param.strides[0] == 2) { if (param.strides[0] == 2) {
ctx.ExtendWorkspace( ctx.ExtendWorkspace(
lite::arm::math::conv3x3s2_direct_workspace_size(param, &ctx)); lite::arm::math::conv3x3s2_direct_workspace_size(param, &ctx));
...@@ -36,12 +31,7 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { ...@@ -36,12 +31,7 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
ctx.ExtendWorkspace( ctx.ExtendWorkspace(
lite::arm::math::conv3x3s1_direct_workspace_size(param, &ctx)); lite::arm::math::conv3x3s1_direct_workspace_size(param, &ctx));
} }
}
template <>
void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = weights_.data<float>(); const auto* w_data = weights_.data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr; const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
...@@ -89,9 +79,6 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -89,9 +79,6 @@ void DirectConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
} }
} }
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::ReInitWhenNeeded() {}
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -148,9 +135,6 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { ...@@ -148,9 +135,6 @@ void DirectConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
} }
} }
template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::ReInitWhenNeeded() {}
template <> template <>
void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void DirectConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
......
...@@ -156,7 +156,6 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -156,7 +156,6 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto w_dims = param.filter->dims(); auto w_dims = param.filter->dims();
auto o_dims = param.output->dims(); auto o_dims = param.output->dims();
last_shape_ = x_dims;
int ic = x_dims[1]; int ic = x_dims[1];
int oc = o_dims[1]; int oc = o_dims[1];
...@@ -179,12 +178,10 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -179,12 +178,10 @@ class DirectConv : public KernelLite<TARGET(kARM), Ptype> {
w_scale_); w_scale_);
} }
virtual void ReInitWhenNeeded();
virtual void Run(); virtual void Run();
/// todo, support inplace weights transform /// todo, support inplace weights transform
protected: protected:
DDim last_shape_;
Tensor weights_; Tensor weights_;
Tensor bias_; Tensor bias_;
bool flag_trans_weights_{false}; bool flag_trans_weights_{false};
......
...@@ -85,6 +85,7 @@ template <> ...@@ -85,6 +85,7 @@ template <>
void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<float>(); auto weights = param.filter->data<float>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<float>(); weights = weights_.data<float>();
...@@ -120,6 +121,7 @@ template <> ...@@ -120,6 +121,7 @@ template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<int8_t>(); auto weights = param.filter->data<int8_t>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<int8_t>(); weights = weights_.data<int8_t>();
...@@ -179,6 +181,7 @@ template <> ...@@ -179,6 +181,7 @@ template <>
void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() { void GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto weights = param.filter->data<int8_t>(); auto weights = param.filter->data<int8_t>();
if (flag_trans_weights_) { if (flag_trans_weights_) {
weights = weights_.data<int8_t>(); weights = weights_.data<int8_t>();
......
...@@ -72,7 +72,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -72,7 +72,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
} else { } else {
//! im2col gemmlike conv //! im2col gemmlike conv
flag_1x1gemm_ = false; flag_1x1gemm_ = false;
ctx.ExtendWorkspace(k * n * sizeof(float)); workspace_size_ = k * n * sizeof(float);
} }
if (!flag_trans_weights_ && n > 1) { if (!flag_trans_weights_ && n > 1) {
lite::arm::math::trans_gemm_weights<Ptype>( lite::arm::math::trans_gemm_weights<Ptype>(
...@@ -97,6 +97,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -97,6 +97,7 @@ class GemmLikeConv : public KernelLite<TARGET(kARM), Ptype> {
bool flag_trans_bias_{false}; bool flag_trans_bias_{false};
Tensor weights_; Tensor weights_;
Tensor bias_; Tensor bias_;
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -40,13 +40,13 @@ void Conv2DTransposeCompute::PrepareForRun() { ...@@ -40,13 +40,13 @@ void Conv2DTransposeCompute::PrepareForRun() {
int group = param.groups; int group = param.groups;
// deconv weights layout: chin * chout * kh * kw // deconv weights layout: chin * chout * kh * kw
auto& ctx = this->ctx_->template As<ARMContext>();
int m = chout * kw * kh / group; int m = chout * kw * kh / group;
int n = hin * win; int n = hin * win;
int k = chin / group; int k = chin / group;
ctx.ExtendWorkspace(group * m * n * sizeof(float)); workspace_size_ = group * m * n * sizeof(float);
auto& ctx = this->ctx_->template As<ARMContext>();
lite::Tensor tmp_weights; lite::Tensor tmp_weights;
lite::arm::math::prepackA( lite::arm::math::prepackA(
&tmp_weights, *(param.filter), 1.f, m, k, group, true, &ctx); &tmp_weights, *(param.filter), 1.f, m, k, group, true, &ctx);
...@@ -57,6 +57,8 @@ void Conv2DTransposeCompute::PrepareForRun() { ...@@ -57,6 +57,8 @@ void Conv2DTransposeCompute::PrepareForRun() {
} }
void Conv2DTransposeCompute::Run() { void Conv2DTransposeCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.ExtendWorkspace(workspace_size_);
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto x_dims = param.x->dims(); auto x_dims = param.x->dims();
auto o_dims = param.output->dims(); auto o_dims = param.output->dims();
...@@ -80,7 +82,6 @@ void Conv2DTransposeCompute::Run() { ...@@ -80,7 +82,6 @@ void Conv2DTransposeCompute::Run() {
int group_size_in = win * hin * chin / group; int group_size_in = win * hin * chin / group;
int group_size_out = wout * hout * chout / group; int group_size_out = wout * hout * chout / group;
int group_size_coldata = m * n; int group_size_coldata = m * n;
auto& ctx = this->ctx_->template As<ARMContext>();
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_roundup = hblock * ((m + hblock - 1) / hblock); int m_roundup = hblock * ((m + hblock - 1) / hblock);
int group_size_weights = ((m_roundup * k + 15) / 16) * 16; int group_size_weights = ((m_roundup * k + 15) / 16) * 16;
......
...@@ -32,6 +32,9 @@ class Conv2DTransposeCompute ...@@ -32,6 +32,9 @@ class Conv2DTransposeCompute
void Run() override; void Run() override;
~Conv2DTransposeCompute() = default; ~Conv2DTransposeCompute() = default;
protected:
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -46,8 +46,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() { ...@@ -46,8 +46,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::ReInitWhenNeeded() {
int max_ch = ic > oc ? ic : oc; int max_ch = ic > oc ? ic : oc;
const int n_wino = size_tile; const int n_wino = size_tile;
ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float);
sizeof(float));
last_shape_ = x_dims; last_shape_ = x_dims;
} }
...@@ -76,8 +75,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -76,8 +75,7 @@ void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_round = hblock * ((m_wino + hblock - 1) / hblock); int m_round = hblock * ((m_wino + hblock - 1) / hblock);
weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic});
ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float);
sizeof(float));
auto weights_wino = auto weights_wino =
static_cast<float*>(malloc(sizeof(float) * 8 * 8 * oc * ic)); static_cast<float*>(malloc(sizeof(float) * 8 * 8 * oc * ic));
void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic);
...@@ -106,6 +104,9 @@ template <> ...@@ -106,6 +104,9 @@ template <>
void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() { void WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
// extend workspace
ctx.ExtendWorkspace(workspace_size_);
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = weights_.data<float>(); const auto* w_data = weights_.data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr; const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
......
...@@ -39,6 +39,7 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> { ...@@ -39,6 +39,7 @@ class WinogradConv : public KernelLite<TARGET(kARM), Ptype> {
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
Tensor weights_; Tensor weights_;
DDim last_shape_; DDim last_shape_;
int workspace_size_{0};
}; };
} // namespace arm } // namespace arm
......
...@@ -127,7 +127,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() { ...@@ -127,7 +127,8 @@ void FcCompute<PRECISION(kFloat), PRECISION(kFloat)>::Run() {
k_, k_,
param.bias != nullptr, param.bias != nullptr,
b_data, b_data,
false); false,
&ctx);
} }
} }
} }
......
...@@ -29,9 +29,25 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -29,9 +29,25 @@ class FillConstantCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<ARMContext>(); auto& context = ctx_->As<ARMContext>();
auto data = param.Out->template mutable_data<T>(); if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
for (int i = 0; i < param.Out->numel(); i++) { auto data = param.Out->template mutable_data<float>();
data[i] = param.value; for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.Out->template mutable_data<int32_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.Out->template mutable_data<int8_t>();
for (int i = 0; i < param.Out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
} }
} }
...@@ -54,9 +70,25 @@ class FillConstantBatchLikeCompute ...@@ -54,9 +70,25 @@ class FillConstantBatchLikeCompute
param.out->Resize(odims); param.out->Resize(odims);
} }
auto data = param.out->template mutable_data<T>(); if (param.dtype == static_cast<int32_t>(lite::core::FluidType::FP32)) {
for (int i = 0; i < param.out->numel(); i++) { auto data = param.out->template mutable_data<float>();
data[i] = param.value; for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT32)) {
auto data = param.out->template mutable_data<int32_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else if (param.dtype ==
static_cast<int32_t>(lite::core::FluidType::INT8)) {
auto data = param.out->template mutable_data<int8_t>();
for (int i = 0; i < param.out->numel(); i++) {
data[i] = param.value;
}
} else {
LOG(FATAL) << "not supported dtype " << param.dtype;
} }
} }
......
...@@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { ...@@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>(); auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X; lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize; lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out; lite::Tensor* Out = param.Out;
float scale = param.scale; float scale = param.scale;
int out_w = param.out_w; int out_w = param.out_w;
...@@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { ...@@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() {
std::string interp_method = "Bilinear"; std::string interp_method = "Bilinear";
lite::arm::math::interpolate(X, lite::arm::math::interpolate(X,
OutSize, OutSize,
SizeTensor,
Scale,
Out, Out,
out_h, out_h,
out_w, out_w,
scale, scale,
scale,
align_corners, align_corners,
interp_method); interp_method);
} }
...@@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { ...@@ -49,6 +52,8 @@ void NearestInterpCompute::Run() {
auto& param = Param<operators::InterpolateParam>(); auto& param = Param<operators::InterpolateParam>();
lite::Tensor* X = param.X; lite::Tensor* X = param.X;
lite::Tensor* OutSize = param.OutSize; lite::Tensor* OutSize = param.OutSize;
auto SizeTensor = param.SizeTensor;
auto Scale = param.Scale;
lite::Tensor* Out = param.Out; lite::Tensor* Out = param.Out;
float scale = param.scale; float scale = param.scale;
int out_w = param.out_w; int out_w = param.out_w;
...@@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { ...@@ -57,11 +62,12 @@ void NearestInterpCompute::Run() {
std::string interp_method = "Nearest"; std::string interp_method = "Nearest";
lite::arm::math::interpolate(X, lite::arm::math::interpolate(X,
OutSize, OutSize,
SizeTensor,
Scale,
Out, Out,
out_h, out_h,
out_w, out_w,
scale, scale,
scale,
align_corners, align_corners,
interp_method); interp_method);
} }
...@@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, ...@@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, ...@@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -28,7 +28,6 @@ namespace arm { ...@@ -28,7 +28,6 @@ namespace arm {
void LookupTableCompute::Run() { void LookupTableCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
// inputs // inputs
auto w = param.W; auto w = param.W;
auto ids = param.Ids; auto ids = param.Ids;
...@@ -37,7 +36,7 @@ void LookupTableCompute::Run() { ...@@ -37,7 +36,7 @@ void LookupTableCompute::Run() {
auto table_dim = w->dims(); auto table_dim = w->dims();
int64_t ids_numel = ids->numel(); int64_t ids_numel = ids->numel();
auto ids_data = ids->data<float>(); auto ids_data = ids->data<int64_t>();
int64_t row_number = table_dim[0]; int64_t row_number = table_dim[0];
int64_t row_width = table_dim[1]; int64_t row_width = table_dim[1];
...@@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, ...@@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table,
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/lookup_table_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <string>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void lookup_table_compute_ref(const operators::LookupTableParam &param) {
auto *ids_t = param.Ids;
auto *output_t = param.Out;
int64_t padding_idx = param.padding_idx;
auto *ids = ids_t->data<int64_t>();
int64_t ids_numel = ids_t->dims().production();
auto *table_t = param.W;
int64_t row_number = table_t->dims()[0];
int64_t row_width = table_t->dims()[1];
auto *table = table_t->data<float>();
auto *output = output_t->mutable_data<float>();
memset(output, 0, output_t->dims().production() * sizeof(float));
for (int64_t i = 0; i < ids_numel; ++i) {
if (padding_idx != -1 && ids[i] == padding_idx) {
memset(output + i * row_width, 0, row_width * sizeof(float));
} else {
CHECK_LT(ids[i], row_number);
CHECK_GE(ids[i], 0);
memcpy(output + i * row_width,
table + ids[i] * row_width,
row_width * sizeof(float));
}
}
}
TEST(lookup_table_arm, retrieve_op) {
auto lookup_table =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"lookup_table");
ASSERT_FALSE(lookup_table.empty());
ASSERT_TRUE(lookup_table.front());
}
TEST(lookup_table_arm, init) {
LookupTableCompute lookup_table;
ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat));
ASSERT_EQ(lookup_table.target(), TARGET(kARM));
}
TEST(lookup_table_arm, compute) {
LookupTableCompute lookup_table;
operators::LookupTableParam param;
lite::Tensor w, ids, out, out_ref;
int64_t padding_idx = -1;
auto w_dim = DDim(std::vector<int64_t>({4, 5}));
auto ids_dim = DDim(std::vector<int64_t>({3, 2}));
auto out_dim = DDim(std::vector<int64_t>({3, 2, 5}));
w.Resize(w_dim);
ids.Resize(ids_dim);
out.Resize(out_dim);
out_ref.Resize(out_dim);
auto *w_data = w.mutable_data<float>();
auto *ids_data = ids.mutable_data<int64_t>();
auto *out_data = out.mutable_data<float>();
auto *out_ref_data = out_ref.mutable_data<float>();
int w_num = w_dim.production();
for (int i = 0; i < w_num; i++) {
w_data[i] = static_cast<float>(i + 1) / (w_num + 1);
}
int ids_num = ids_dim.production();
for (int i = 0; i < ids_num; i++) {
ids_data[i] = i % 4;
}
int out_num = out_dim.production();
param.W = &w;
param.Ids = &ids;
param.Out = &out;
lookup_table.SetParam(param);
lookup_table.Run();
param.Out = &out_ref;
lookup_table_compute_ref(param);
for (int i = 0; i < out_num; i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def);
...@@ -31,16 +31,16 @@ void LrnCompute::Run() { ...@@ -31,16 +31,16 @@ void LrnCompute::Run() {
int channel = x_dims[1]; int channel = x_dims[1];
int h = x_dims[2]; int h = x_dims[2];
int w = x_dims[3]; int w = x_dims[3];
const int local_size = param.local_size; const int n = param.n;
const float alpha = param.alpha; const float alpha = param.alpha;
const float beta = param.beta; const float beta = param.beta;
const float k = param.k; const float k = param.k;
if (param.norm_region == "AcrossChannels") { if (param.norm_region == "AcrossChannels") {
lite::arm::math::compute_across_channels( lite::arm::math::compute_across_channels(
x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); x_data, out_data, num, channel, h, w, n, alpha, beta, k);
} else { } else {
lite::arm::math::compute_within_channels( lite::arm::math::compute_within_channels(
x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); x_data, out_data, num, channel, h, w, n, alpha, beta, k);
} }
} }
...@@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL( ...@@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL(
lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { ...@@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) {
const dtype* x_data = param.X->data<const dtype>(); const dtype* x_data = param.X->data<const dtype>();
dtype* out_data = param.Out->mutable_data<dtype>(); dtype* out_data = param.Out->mutable_data<dtype>();
auto x_dims = param.X->dims(); auto x_dims = param.X->dims();
int local_size = param.local_size; int local_size = param.n;
float alpha = param.alpha; float alpha = param.alpha;
float beta = param.beta; float beta = param.beta;
float k = param.k; float k = param.k;
...@@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { ...@@ -171,7 +171,7 @@ TEST(lrn_arm, compute) {
} }
param.X = &x; param.X = &x;
param.Out = &output; param.Out = &output;
param.local_size = local_size; param.n = local_size;
param.alpha = alpha; param.alpha = alpha;
param.beta = beta; param.beta = beta;
param.k = k; param.k = k;
......
...@@ -232,7 +232,7 @@ void MatMulCompute::Run() { ...@@ -232,7 +232,7 @@ void MatMulCompute::Run() {
int ldc = n_; int ldc = n_;
if (n_ == 1) { if (n_ == 1) {
lite::arm::math::sgemv( lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false); x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
if (fabsf(alpha - 1.f) > 1e-8f) { if (fabsf(alpha - 1.f) > 1e-8f) {
for (size_t i = 0; i < param.Out->dims().production(); ++i) { for (size_t i = 0; i < param.Out->dims().production(); ++i) {
o_data[i] *= alpha; o_data[i] *= alpha;
......
...@@ -48,14 +48,13 @@ void MulCompute::Run() { ...@@ -48,14 +48,13 @@ void MulCompute::Run() {
CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h";
k_ = x_w; k_ = x_w;
auto& ctx = this->ctx_->template As<ARMContext>();
if (n_ == 1) { if (n_ == 1) {
lite::arm::math::sgemv( lite::arm::math::sgemv(
x_data, y_data, o_data, false, m_, k_, false, nullptr, false); x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx);
} else { } else {
constexpr bool is_tranposed_y = false; constexpr bool is_tranposed_y = false;
auto& ctx = this->ctx_->template As<ARMContext>();
int hblock = lite::arm::math::get_hblock(&ctx); int hblock = lite::arm::math::get_hblock(&ctx);
int m_round = hblock * ((m_ + hblock - 1) / hblock); int m_round = hblock * ((m_ + hblock - 1) / hblock);
ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); ctx.ExtendWorkspace(m_round * k_ * sizeof(float));
......
...@@ -66,7 +66,6 @@ void PoolCompute::Run() { ...@@ -66,7 +66,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling_global_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling_global_avg(din, lite::arm::math::pooling_global_avg(din,
...@@ -78,7 +77,6 @@ void PoolCompute::Run() { ...@@ -78,7 +77,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling_global_ave";
return; return;
} }
} else { } else {
...@@ -93,7 +91,6 @@ void PoolCompute::Run() { ...@@ -93,7 +91,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling2x2s2_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling2x2s2_avg(din, lite::arm::math::pooling2x2s2_avg(din,
...@@ -106,7 +103,6 @@ void PoolCompute::Run() { ...@@ -106,7 +103,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling2x2s2_avg";
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
...@@ -121,7 +117,6 @@ void PoolCompute::Run() { ...@@ -121,7 +117,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invokingpooling3x3s1p1_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p1_avg(din, lite::arm::math::pooling3x3s1p1_avg(din,
...@@ -134,7 +129,32 @@ void PoolCompute::Run() { ...@@ -134,7 +129,32 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s1p1_avg"; return;
}
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
kps_equal) {
if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3]);
return;
} else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s1p0_avg(din,
dout,
out_dims[0],
out_dims[1],
out_dims[2],
out_dims[3],
in_dims[1],
in_dims[2],
in_dims[3],
exclusive);
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
...@@ -149,7 +169,6 @@ void PoolCompute::Run() { ...@@ -149,7 +169,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "pooling3x3s2p0_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p0_avg(din, lite::arm::math::pooling3x3s2p0_avg(din,
...@@ -162,7 +181,6 @@ void PoolCompute::Run() { ...@@ -162,7 +181,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s2p0_avg";
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
...@@ -177,7 +195,6 @@ void PoolCompute::Run() { ...@@ -177,7 +195,6 @@ void PoolCompute::Run() {
in_dims[1], in_dims[1],
in_dims[2], in_dims[2],
in_dims[3]); in_dims[3]);
VLOG(3) << "invoking pooling3x3s2p1_max";
return; return;
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
lite::arm::math::pooling3x3s2p1_avg(din, lite::arm::math::pooling3x3s2p1_avg(din,
...@@ -190,7 +207,6 @@ void PoolCompute::Run() { ...@@ -190,7 +207,6 @@ void PoolCompute::Run() {
in_dims[2], in_dims[2],
in_dims[3], in_dims[3],
exclusive); exclusive);
VLOG(3) << "invoking pooling3x3s2p1_avg";
return; return;
} }
} }
...@@ -213,7 +229,6 @@ void PoolCompute::Run() { ...@@ -213,7 +229,6 @@ void PoolCompute::Run() {
ceil_mode, ceil_mode,
use_quantizer, use_quantizer,
pooling_type); pooling_type);
VLOG(3) << "invoking pooling_basic";
} }
} // namespace arm } // namespace arm
......
...@@ -68,8 +68,8 @@ void pool_compute_ref(const operators::PoolParam& param) { ...@@ -68,8 +68,8 @@ void pool_compute_ref(const operators::PoolParam& param) {
auto& in_dims = param.x->dims(); auto& in_dims = param.x->dims();
auto& out_dims = param.output->dims(); auto& out_dims = param.output->dims();
const float* src_ptr = param.x->data<const float>(); const float* din = param.x->data<const float>();
float* dst_ptr = param.output->mutable_data<float>(); float* dout = param.output->mutable_data<float>();
std::vector<int> ksize = param.ksize; std::vector<int> ksize = param.ksize;
std::vector<int> strides = param.strides; std::vector<int> strides = param.strides;
...@@ -83,84 +83,120 @@ void pool_compute_ref(const operators::PoolParam& param) { ...@@ -83,84 +83,120 @@ void pool_compute_ref(const operators::PoolParam& param) {
bool use_quantizer = param.use_quantizer; bool use_quantizer = param.use_quantizer;
std::string data_format = param.data_format; std::string data_format = param.data_format;
int in_n = in_dims[0]; int num = in_dims[0];
int in_c = in_dims[1]; int chin = in_dims[1];
int in_h = in_dims[2]; int hin = in_dims[2];
int in_w = in_dims[3]; int win = in_dims[3];
int size_in_n = in_c * in_h * in_w;
int size_in_c = in_h * in_w;
int out_h = out_dims[2]; int chout = out_dims[1];
int out_w = out_dims[3]; int hout = out_dims[2];
int size_out_n = in_c * out_h * out_w; int wout = out_dims[3];
int size_out_c = out_h * out_w;
int window_h = ksize[0]; // no need to pad the input tensor; the border is zero-padded inside this function
int window_w = ksize[1]; memset(dout, 0, num * chout * hout * wout * sizeof(float));
int kernel_h = ksize[0];
int kernel_w = ksize[1];
int stride_h = strides[0]; int stride_h = strides[0];
int stride_w = strides[1]; int stride_w = strides[1];
int pad_h = paddings[0]; int pad_h = paddings[0];
int pad_w = paddings[1]; int pad_w = paddings[1];
int size_channel_in = win * hin;
if (global_pooling == true) { int size_channel_out = wout * hout;
for (int n = 0; n < in_n; ++n) { if (global_pooling) {
for (int c = 0; c < in_c; ++c) { if (pooling_type == "max") { // Pooling_max
const float* src = src_ptr + n * size_in_n + c * size_in_c; for (int n = 0; n < num; ++n) {
float res = src[0]; float* dout_batch = dout + n * chout * size_channel_out;
if (pooling_type == "max") { const float* din_batch = din + n * chin * size_channel_in;
for (int i = 1; i < size_in_c; ++i) { #pragma omp parallel for
float cur_val = src[i]; for (int c = 0; c < chout; ++c) {
res = cur_val > res ? cur_val : res; const float* din_ch = din_batch + c * size_channel_in; // in address
float tmp1 = din_ch[0];
for (int i = 0; i < size_channel_in; ++i) {
float tmp2 = din_ch[i];
tmp1 = tmp1 > tmp2 ? tmp1 : tmp2;
} }
} else if (pooling_type == "avg") { dout_batch[c] = tmp1;
for (int i = 1; i < size_in_c; ++i) { }
float cur_val = src[i]; }
res += cur_val; } else if (pooling_type == "avg") {
// Pooling_average_include_padding
// Pooling_average_exclude_padding
for (int n = 0; n < num; ++n) {
float* dout_batch = dout + n * chout * size_channel_out;
const float* din_batch = din + n * chin * size_channel_in;
#pragma omp parallel for
for (int c = 0; c < chout; ++c) {
const float* din_ch = din_batch + c * size_channel_in; // in address
float sum = 0.f;
for (int i = 0; i < size_channel_in; ++i) {
sum += din_ch[i];
} }
res /= size_in_c; dout_batch[c] = sum / size_channel_in;
} }
dst_ptr[n * size_out_n + c] = res;
} }
} else {
LOG(FATAL) << "unsupported pooling type: " << pooling_type;
} }
} else { } else {
for (int n = 0; n < in_n; ++n) { for (int ind_n = 0; ind_n < num; ++ind_n) {
for (int c = 0; c < in_c; ++c) { #pragma omp parallel for
for (int h = 0; h < out_h; ++h) { for (int ind_c = 0; ind_c < chin; ++ind_c) {
int sh = h * stride_h; for (int ind_h = 0; ind_h < hout; ++ind_h) {
int eh = sh + window_h; int sh = ind_h * stride_h;
int eh = sh + kernel_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; eh = (eh - pad_h) > hin ? hin : eh - pad_h;
for (int w = 0; w < out_w; ++w) { for (int ind_w = 0; ind_w < wout; ++ind_w) {
int sw = w * stride_w; int sw = ind_w * stride_w;
int ew = sw + window_w; int ew = sw + kernel_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; ew = (ew - pad_w) > win ? win : ew - pad_w;
int pooling_size = (ew - sw) * (eh - sh); float result = static_cast<float>(0);
if (pooling_size == 0) continue; int dst_ind = (ind_n * chout + ind_c) * size_channel_out +
float res = 0.f; ind_h * wout + ind_w;
for (int kh = sh; kh < eh; ++kh) { for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) { for (int kw = sw; kw < ew; ++kw) {
int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; int src_ind =
(ind_n * chin + ind_c) * size_channel_in + kh * win + kw;
if (kh == sh && kw == sw) { if (kh == sh && kw == sw) {
res = src_ptr[src_idx]; result = din[src_ind];
} else { } else {
if (pooling_type == "max") { if (pooling_type == "max") {
res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx]; result = result >= din[src_ind] ? result : din[src_ind];
} } else if (pooling_type == "avg") {
if (pooling_type == "avg") { result += din[src_ind];
res += src_ptr[src_idx];
} }
} }
} }
} }
if (pooling_type == "avg") { if (pooling_type == "avg") {
if (exclusive) { if (exclusive) {
res /= pooling_size; int div = (ew - sw) * (eh - sh);
div = div > 0 ? div : 1;
result /= div;
} else { } else {
res /= window_h * window_w; int bh = kernel_h;
int bw = kernel_w;
if (ew == win) {
bw = sw + kernel_w >= win + pad_w ? win + pad_w
: sw + kernel_w;
bw -= sw;
if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) {
bw += pad_w;
}
}
if (eh == hin) {
bh = sh + kernel_h >= hin + pad_h ? hin + pad_h
: sh + kernel_h;
bh -= sh;
if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) {
bh += pad_h;
}
}
result /= bh * bw;
} }
} }
dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; dout[dst_ind] = result;
} }
} }
} }
......
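The exclusive/inclusive branch in the rewritten pool_compute_ref is the subtle part of this hunk. The helper below is a minimal sketch (a hypothetical stand-alone function, not part of the diff) that reproduces the divisor choice for a single already-clipped window, followed by one worked case.

#include <algorithm>

// sh/eh/sw/ew are the window bounds after the same clipping as above
// (shifted by the padding and clamped to [0, hin) x [0, win)).
int avg_pool_divisor(int sh, int eh, int sw, int ew,
                     int kernel_h, int kernel_w,
                     int hin, int win, int pad_h, int pad_w,
                     bool exclusive) {
  if (exclusive) {
    int div = (ew - sw) * (eh - sh);  // count of valid input elements
    return div > 0 ? div : 1;
  }
  int bh = kernel_h, bw = kernel_w;   // default: full kernel area
  if (ew == win) {                    // window clipped at the right edge
    bw = std::min(sw + kernel_w, win + pad_w) - sw;
    if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) bw += pad_w;
  }
  if (eh == hin) {                    // window clipped at the bottom edge
    bh = std::min(sh + kernel_h, hin + pad_h) - sh;
    if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) bh += pad_h;
  }
  return bh * bw;
}
// Worked case (assumed values): 4x4 input, 3x3 kernel, stride 1, pad 1, at
// output (0, 0) the clipped window is rows [0, 2) x cols [0, 2), so
// exclusive divides by 4 while inclusive divides by the kernel area 9.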
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/arm/slice_compute.h" #include "lite/kernels/arm/slice_compute.h"
#include <algorithm>
#include <vector> #include <vector>
#include "lite/backends/arm/math/funcs.h" #include "lite/backends/arm/math/funcs.h"
...@@ -20,22 +21,145 @@ namespace lite { ...@@ -20,22 +21,145 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
inline std::vector<int32_t> get_new_data_from_tensorlist(
const std::vector<lite::Tensor*>& list_new_data_tensor) {
// get tensor
std::vector<int32_t> vec_new_data;
for (size_t i = 0; i < list_new_data_tensor.size(); ++i) {
auto tensor = list_new_data_tensor[i];
CHECK_EQ(tensor->dims(), DDim({1})) << "shape of dim tensor should be [1]";
vec_new_data.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
}
return vec_new_data;
}
inline std::vector<int32_t> get_new_data_from_tensor(
const lite::Tensor* new_data_tensor) {
std::vector<int32_t> vec_new_data;
auto* new_data = new_data_tensor->data<int32_t>();
vec_new_data =
std::vector<int32_t>(new_data, new_data + new_data_tensor->numel());
return vec_new_data;
}
void SliceCompute::PrepareForRun() {} void SliceCompute::PrepareForRun() {}
void SliceCompute::Run() { void SliceCompute::Run() {
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
auto& param = this->Param<operators::SliceParam>(); auto& param = this->Param<operators::SliceParam>();
auto input_dims = param.X->dims(); auto in = param.X;
int dim_size = param.X->dims().size(); auto in_dims = in->dims();
auto out = param.Out;
auto out_dims = out->dims();
std::vector<int> starts = param.starts;
std::vector<int> ends = param.ends;
std::vector<int> axes = param.axes; std::vector<int> axes = param.axes;
const auto* x_data = param.X->data<int>(); std::vector<int32_t> starts = param.starts;
auto* o_data = param.Out->mutable_data<int>(); std::vector<int32_t> ends = param.ends;
std::vector<int> decrease_axis = param.decrease_axis;
std::vector<int> infer_flags = param.infer_flags;
auto list_new_ends_tensor = param.EndsTensorList;
auto list_new_starts_tensor = param.StartsTensorList;
bool need_infer = false;
if (param.StartsTensor || param.EndsTensor) {
need_infer = true;
}
if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) {
need_infer = true;
}
if (need_infer) {
if (param.StartsTensor) {
starts = get_new_data_from_tensor(param.StartsTensor);
} else if (list_new_starts_tensor.size() > 0) {
starts = get_new_data_from_tensorlist(list_new_starts_tensor);
}
CHECK_EQ(starts.size(), axes.size())
<< "The size of starts must be equal to the size of axes.";
if (param.EndsTensor) {
ends = get_new_data_from_tensor(param.EndsTensor);
} else if (list_new_ends_tensor.size() > 0) {
ends = get_new_data_from_tensorlist(list_new_ends_tensor);
}
CHECK_EQ(ends.size(), axes.size())
<< "The size of ends must be equal to the size of axes.";
out_dims = in_dims;
int dim_value, start, end;
for (size_t i = 0; i < axes.size(); ++i) {
dim_value = out_dims[axes[i]];
if (dim_value > 0) {
// when end = start+1 and start == -1
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = 10000000;
}
}
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0);
end = std::max(end, 0);
end = std::min(end, dim_value);
CHECK_GT(end, start) << "end should be greater than start";
out_dims[axes[i]] = end - start;
}
}
out->Resize(out_dims);
// generate new shape
if (decrease_axis.size() > 0) {
std::vector<int64_t> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
CHECK_EQ(out_dims[decrease_axis[i]], 1) << "decrease dim should be 1";
out_dims[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims.size(); ++i) {
if (out_dims[i] != 0) {
new_out_shape.push_back(out_dims[i]);
}
}
if (new_out_shape.size() == 0) {
new_out_shape.push_back(1);
}
DDim new_dims;
new_dims.ConstructFrom(new_out_shape);
out_dims = new_dims;
}
}
// resize out dims
if (decrease_axis.size() > 0) {
if (decrease_axis.size() == (size_t)in_dims.size()) {
std::vector<int64_t> vec_origin_out_shape(decrease_axis.size(), 1);
out->Resize(DDim(vec_origin_out_shape));
} else {
std::vector<int64_t> vec_origin_out_shape(
out_dims.size() + decrease_axis.size(), -1);
for (size_t i = 0; i < decrease_axis.size(); ++i) {
vec_origin_out_shape[decrease_axis[i]] = 1;
}
int index = 0;
for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) {
if (vec_origin_out_shape[i] == -1) {
vec_origin_out_shape[i] = out_dims[index];
++index;
}
}
out->Resize(DDim(vec_origin_out_shape));
}
}
auto new_out_dims = out->dims();
const auto* x_data = in->data<int>();
auto* o_data = out->mutable_data<int>();
lite::arm::math::slice( lite::arm::math::slice(
x_data, input_dims.data(), axes, starts, ends, o_data, &ctx); x_data, in_dims.data(), axes, starts, ends, o_data, &ctx);
} }
} // namespace arm } // namespace arm
...@@ -46,12 +170,9 @@ void SliceCompute::Run() { ...@@ -46,12 +170,9 @@ void SliceCompute::Run() {
REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL(
slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def) slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensor", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("StartsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("EndsTensorList", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
// REGISTER_LITE_KERNEL(
// slice, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SliceCompute, def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), Precision(kINT32))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM),
// Precision(kINT32))})
// .Finalize();
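To make the start/end normalization in SliceCompute::Run() concrete, here is a small self-contained example (shapes and indices are illustrative assumptions): negative starts/ends wrap around the dimension, and ends are clamped to the dimension before the output extent is computed.

#include <algorithm>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> in_dims = {3, 4, 5};
  std::vector<int> axes = {1}, starts = {-3}, ends = {100};
  std::vector<int64_t> out_dims = in_dims;
  for (size_t i = 0; i < axes.size(); ++i) {
    int dim_value = static_cast<int>(in_dims[axes[i]]);            // 4
    int start = starts[i] < 0 ? starts[i] + dim_value : starts[i];  // -3 + 4 = 1
    int end = ends[i] < 0 ? ends[i] + dim_value : ends[i];          // 100
    start = std::max(start, 0);
    end = std::min(std::max(end, 0), dim_value);                    // clamp to 4
    out_dims[axes[i]] = end - start;                                // 3
  }
  // out_dims is now {3, 3, 5}
  return 0;
}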
...@@ -55,6 +55,10 @@ REGISTER_LITE_KERNEL(unsqueeze, ...@@ -55,6 +55,10 @@ REGISTER_LITE_KERNEL(unsqueeze,
paddle::lite::kernels::host::UnsqueezeCompute, paddle::lite::kernels::host::UnsqueezeCompute,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -65,6 +69,10 @@ REGISTER_LITE_KERNEL(unsqueeze2, ...@@ -65,6 +69,10 @@ REGISTER_LITE_KERNEL(unsqueeze2,
paddle::lite::kernels::host::Unsqueeze2Compute, paddle::lite::kernels::host::Unsqueeze2Compute,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("AxesTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("AxesTensorList",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize(); .Finalize();
...@@ -9,6 +9,7 @@ add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_k ...@@ -9,6 +9,7 @@ add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_k
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose)
add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda})
...@@ -22,6 +23,16 @@ add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_k ...@@ -22,6 +23,16 @@ add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_k
add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps})
add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(search_seq_depadding_compute_cuda CUDA basic SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps})
add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps})
add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps})
add_kernel(match_matrix_tensor_compute_cuda CUDA basic SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm)
add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm)
add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm)
add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda)
nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
...@@ -32,8 +43,22 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c ...@@ -32,8 +43,22 @@ nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_c
nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda)
nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda)
nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda)
nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda)
#nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda)
nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda)
nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda)
nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda)
nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
if(LITE_BUILD_EXTRA)
nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda)
nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda)
nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/attention_padding_mask_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
#define CUDA_NUM_THREADS 256
inline int CUDA_GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void ker_attention_padding_mask(T* out_data,
const T* attn_data,
const int* src_offset,
const int attn_seq_num,
const int attn_seq_len,
const int src_seq_num,
const int src_seq_len,
const T mask,
const int count) {
CUDA_KERNEL_LOOP(tid, count) {
int src_word_id = tid % src_seq_len;
int tmp_tid = tid / src_seq_len;
int attn_seq_id = tmp_tid / attn_seq_len;
int attn_word_id = tmp_tid % attn_seq_len;
int src_seq_id = attn_seq_id % src_seq_num;
int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id];
if (src_word_id >= cur_len) {
out_data[tid] = mask;
} else {
out_data[tid] = attn_data[tid];
}
}
}
void AttentionPaddingMaskCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto attn = param.X;
auto src = param.Y;
const int count = attn->numel();
auto attn_offset = attn->lod()[0];
auto src_offset = src->lod()[0];
const int attn_seq_num = attn_offset.size() - 1;
const int attn_seq_len = attn_offset[1];
const int src_seq_num = src_offset.size() - 1;
const int src_seq_len = count / attn->dims()[0];
auto out = param.Out;
out->Resize(attn->dims());
out->set_lod(attn->lod());
auto attn_data = attn->data<float>();
auto out_data = out->mutable_data<float>(TARGET(kCUDA));
std::vector<int> src_offset_cpu(src_offset.size(), 0);
for (int i = 0; i < src_offset.size(); i++) {
src_offset_cpu[i] = src_offset[i];
}
src_offset_cuda.Resize({static_cast<int64_t>(src_offset.size())});
auto src_offset_cuda_data = src_offset_cuda.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data,
src_offset_cpu.data(),
sizeof(int) * src_offset.size(),
IoDirection::HtoD,
stream);
ker_attention_padding_mask<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
out_data,
attn_data,
src_offset_cuda_data,
attn_seq_num,
attn_seq_len,
src_seq_num,
src_seq_len,
param.mask,
count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(attention_padding_mask,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::AttentionPaddingMaskCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AttentionPaddingMaskCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::AttentionPaddingMaskParam;
void Run() override;
virtual ~AttentionPaddingMaskCompute() = default;
private:
lite::Tensor src_offset_cuda;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/attention_padding_mask_compute.h"
#include <gtest/gtest.h>
#include <cstring>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
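// CPU reference: copy the attention matrix, then overwrite every column that
// lies beyond the corresponding source sequence's length with param.mask.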
void attention_padding_mask_ref(
const Tensor& x,
const Tensor& y,
Tensor* out,
Tensor* pad_begin,
const operators::AttentionPaddingMaskParam& param) {
auto attn_offset = x.lod()[0];
auto src_offset = y.lod()[0];
int attn_seq_num = attn_offset.size() - 1;
int src_seq_num = src_offset.size() - 1;
int attn_seq_len = attn_offset[1];
int src_seq_len = x.dims()[1];
CHECK_EQ(attn_seq_num % src_seq_num, 0);
auto count = x.numel();
auto attn_data = x.data<float>();
out->Resize(x.dims());
out->set_lod(x.lod());
auto out_data = out->mutable_data<float>();
memcpy(out_data, attn_data, count * sizeof(float));
for (int i = 0; i < attn_seq_num; ++i) {
for (int j = 0; j < attn_seq_len; ++j) {
auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j);
int src_seq_idx = i % src_seq_num;
int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx];
for (int k = cur_len; k < src_seq_len; k++) {
tmp_out_data[k] = param.mask;
}
}
}
}
void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) {
std::vector<int64_t> x_dims{static_cast<int64_t>(lod[0].back()), dim2rd};
x->Resize(x_dims);
x->set_lod(lod);
auto x_data = x->mutable_data<float>();
auto x_num = x->numel();
for (int i = 0; i < x_num; i++) {
x_data[i] = (i - x_num) * 1.1;
}
}
int get_max_len(const LoD& lod) {
int max_len = 0;
auto offset = lod[0];
for (int i = 0; i < offset.size() - 1; i++) {
int cur_len = offset[i + 1] - offset[i];
max_len = max_len < cur_len ? cur_len : max_len;
}
return max_len;
}
TEST(attention_padding_mask_cuda, run_test) {
lite::Tensor x, y, x_cpu, y_cpu;
lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref;
LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}};
prepare_input(&x_cpu, x_lod, get_max_len(y_lod));
prepare_input(&y_cpu, y_lod, 1);
x.Resize(x_cpu.dims());
x.set_lod(x_cpu.lod());
auto x_cpu_data = x_cpu.mutable_data<float>();
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
y.Resize(y_cpu.dims());
y.set_lod(y_cpu.lod());
operators::AttentionPaddingMaskParam param;
param.X = &x;
param.Y = &y;
param.pad_id = 12800001;
param.mask = -90000000.f;
param.Out = &out;
param.pad_begin = &pad_begin;
std::unique_ptr<KernelContext> ctx(new KernelContext);
  auto& context = ctx->As<CUDAContext>();
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
AttentionPaddingMaskCompute attention_padding_mask_kernel;
attention_padding_mask_kernel.SetParam(param);
attention_padding_mask_kernel.SetContext(std::move(ctx));
attention_padding_mask_kernel.Run();
cudaDeviceSynchronize();
auto out_data = out.mutable_data<float>(TARGET(kCUDA));
out_cpu.Resize(out.dims());
auto out_cpu_data = out_cpu.mutable_data<float>();
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param);
auto out_ref_data = out_ref.data<float>();
for (int i = 0; i < out.numel(); i++) {
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/cuda/bilinear_interp_compute.h" #include "lite/kernels/cuda/bilinear_interp_compute.h"
...@@ -20,6 +21,43 @@ namespace kernels { ...@@ -20,6 +21,43 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
inline std::vector<int> get_new_shape(
std::vector<const lite::Tensor*> list_new_shape_tensor) {
  // Copy each one-element shape tensor from device to host and collect its value.
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
lite::Tensor temp;
auto temp_data = temp.mutable_data<float>();
auto tensor_data = tensor->data<float>();
cudaMemcpy(temp_data,
tensor_data,
tensor->dims().production() * sizeof(float),
cudaMemcpyDeviceToHost);
vec_new_shape.push_back(static_cast<int32_t>(*temp_data));
}
return vec_new_shape;
}
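// Copies a small device-resident tensor to the host and returns its contents
// as a std::vector<T>; used below to read the Scale tensor on the CPU.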
template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
std::vector<T> vec_new_data;
auto* new_data = new_data_tensor->data<T>();
lite::Tensor cpu_starts_tensor;
auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data<T>();
cudaMemcpy(cpu_starts_tensor_data,
new_data,
new_data_tensor->dims().production() * sizeof(T),
cudaMemcpyDeviceToHost);
auto new_data_ = cpu_starts_tensor.data<T>();
vec_new_data = std::vector<T>(
new_data_, new_data_ + new_data_tensor->dims().production());
return vec_new_data;
}
template <typename T> template <typename T>
__global__ void BilinearInterp(const T* in, __global__ void BilinearInterp(const T* in,
const size_t in_img_h, const size_t in_img_h,
...@@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { ...@@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() {
int out_w = param.out_w; int out_w = param.out_w;
float scale = param.scale; float scale = param.scale;
bool align_corners = param.align_corners; bool align_corners = param.align_corners;
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
if (out_size != nullptr) { auto list_new_shape_tensor = param.SizeTensor;
Tensor sizes; if (list_new_shape_tensor.size() > 0) {
float* size_data = sizes.mutable_data<float>(); // have size tensor
float* outsize_data = out_size->mutable_data<float>(TARGET(kCUDA)); auto new_size = get_new_shape(list_new_shape_tensor);
cudaMemcpy( out_h = new_size[0];
size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); out_w = new_size[1];
out_h = static_cast<int>(size_data[0]); } else {
out_w = static_cast<int>(size_data[1]); auto scale_tensor = param.Scale;
if (scale_tensor != nullptr) {
auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
scale = scale_data[0];
}
if (scale > 0) {
out_h = static_cast<int>(in_h * scale);
out_w = static_cast<int>(in_w * scale);
}
if (out_size != nullptr) {
lite::Tensor sizes;
float* size_data = sizes.mutable_data<float>();
float* outsize_data = out_size->mutable_data<float>(TARGET(kCUDA));
cudaMemcpy(
size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost);
out_h = static_cast<int>(size_data[0]);
out_w = static_cast<int>(size_data[1]);
}
} }
auto output_data = output->mutable_data<float>(TARGET(kCUDA)); auto output_data = output->mutable_data<float>(TARGET(kCUDA));
if (in_h == out_h && in_w == out_w) { if (in_h == out_h && in_w == out_w) {
cudaMemcpy(output_data, cudaMemcpy(output_data,
input_data, input_data,
...@@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, ...@@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp,
{LiteType::GetTensorTy(TARGET(kCUDA), {LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW))}) DATALAYOUT(kNCHW))})
.BindInput("SizeTensor",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindInput("Scale",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNCHW))})
.BindOutput("Out", .BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), {LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <memory> #include <memory>
#include <utility> #include <utility>
#include <vector>
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { ...@@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) {
} }
} }
TEST(bilinear_interp, update) {
BilinearInterpCompute bilinear_interp_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::InterpolateParam param;
std::vector<Tensor> size_tensor(2);
std::vector<Tensor> size_tensor_cpu(2), size_tensor_ref(2);
Tensor x, input_scale, osz, out;
Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu;
Tensor x_ref, input_scale_ref, osz_ref, out_ref;
int n = 1, c = 1, in_h = 3, in_w = 3;
int out_h = 6, out_w = 6;
float scale = 2.0;
param.out_h = out_h;
param.out_w = out_w;
param.scale = scale;
param.align_corners = false;
param.align_mode = 0;
x.Resize({n, c, in_h, in_w});
size_tensor[0].Resize({1});
size_tensor[1].Resize({1});
input_scale.Resize({1});
osz.Resize({2});
out.Resize({n, c, out_h, out_w});
x_cpu.Resize({n, c, in_h, in_w});
size_tensor_cpu[0].Resize({1});
size_tensor_cpu[1].Resize({1});
input_scale_cpu.Resize({1});
osz_cpu.Resize({2});
out_cpu.Resize({n, c, out_h, out_w});
x_ref.Resize({n, c, in_h, in_w});
size_tensor_ref[0].Resize({1});
size_tensor_ref[1].Resize({1});
input_scale_ref.Resize({1});
osz_ref.Resize({2});
out_ref.Resize({n, c, out_h, out_w});
auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data<float>();
float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data<float>();
float* input_scale_cpu_data = input_scale_cpu.mutable_data<float>();
float* osz_cpu_data = osz_cpu.mutable_data<float>();
float* out_cpu_data = out_cpu.mutable_data<float>();
float* x_ref_data = x_ref.mutable_data<float>();
float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data<float>();
float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data<float>();
float* input_scale_ref_data = input_scale_ref.mutable_data<float>();
float* osz_ref_data = osz_ref.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); ++i) {
x_cpu_data[i] = i + 5.0;
x_ref_data[i] = i + 5.0;
}
osz_cpu_data[0] = out_h;
osz_cpu_data[1] = out_w;
size_tensor0_cpu_data[0] = out_h;
size_tensor1_cpu_data[0] = out_w;
input_scale_cpu_data[0] = scale;
osz_ref_data[0] = out_h;
osz_ref_data[1] = out_w;
size_tensor0_ref_data[0] = out_h;
size_tensor1_ref_data[0] = out_w;
input_scale_ref_data[0] = scale;
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
size_tensor[0].Assign<float, lite::DDim, TARGET(kCUDA)>(
size_tensor0_cpu_data, size_tensor[0].dims());
size_tensor[1].Assign<float, lite::DDim, TARGET(kCUDA)>(
size_tensor1_cpu_data, size_tensor[1].dims());
input_scale.Assign<float, lite::DDim, TARGET(kCUDA)>(input_scale_cpu_data,
input_scale.dims());
osz.Assign<float, lite::DDim, TARGET(kCUDA)>(osz_cpu_data, osz_cpu.dims());
param.X = &x;
param.SizeTensor.emplace_back(
reinterpret_cast<const Tensor*>(&size_tensor[0]));
param.SizeTensor.emplace_back(
reinterpret_cast<const Tensor*>(&size_tensor[1]));
param.Scale = &input_scale;
param.OutSize = &osz;
param.Out = &out;
bilinear_interp_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
bilinear_interp_kernel.SetContext(std::move(ctx));
bilinear_interp_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
for (int i = 0; i < out.numel(); i++) {
LOG(INFO) << out_cpu_data[i];
}
}
} // namespace cuda } // namespace cuda
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -51,6 +51,11 @@ void ConcatCompute<Dtype>::Run() { ...@@ -51,6 +51,11 @@ void ConcatCompute<Dtype>::Run() {
Tensor* output = param.output; Tensor* output = param.output;
auto* output_data = output->mutable_data<Dtype>(TARGET(kCUDA)); auto* output_data = output->mutable_data<Dtype>(TARGET(kCUDA));
int axis = param.axis; int axis = param.axis;
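  // If AxisTensor is supplied, its first element overrides the axis attribute.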
Tensor* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
const int* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
int inner_size = 1; int inner_size = 1;
int outer_size = 1; int outer_size = 1;
auto input_dims = input[0]->dims(); auto input_dims = input[0]->dims();
...@@ -97,5 +102,7 @@ REGISTER_LITE_KERNEL(concat, ...@@ -97,5 +102,7 @@ REGISTER_LITE_KERNEL(concat,
paddle::lite::kernels::cuda::ConcatCompute<float>, paddle::lite::kernels::cuda::ConcatCompute<float>,
def) def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize(); .Finalize();
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/lookup_table_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
using Tensor = lite::Tensor;
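// Looks up one embedding row per id: ids are distributed over (threadIdx.y,
// blockIdx.x) and the D columns of each selected row are strided over
// threadIdx.x. When PaddingFlag is set, rows whose id equals padding_idx are
// written as zeros instead of being read from the table.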
template <int BlockDimX, int BlockDimY, int GridDimX, bool PaddingFlag>
__global__ void LookupTableKernel(float *output,
const float *table,
const int64_t *ids,
const int64_t N,
const int64_t K,
const int64_t D,
const int64_t padding_idx) {
int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * GridDimX;
while (idy < K) {
int64_t id = ids[idy];
float *out = output + idy * D;
const float *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
if (PaddingFlag) {
if (id == padding_idx)
out[i] = static_cast<float>(0);
else
out[i] = tab[i];
} else {
out[i] = tab[i];
}
}
idy += BlockDimY * GridDimX;
}
}
void LookupTableCompute::Run() {
auto &param = this->Param<param_t>();
auto &ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
Tensor *w_t = param.W;
Tensor *ids_t = param.Ids;
Tensor *out_t = param.Out;
int64_t padding_idx = param.padding_idx;
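  // N: vocabulary size (rows of W), D: embedding width, K: number of ids.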
size_t N = w_t->dims()[0];
size_t D = w_t->dims()[1];
size_t K = ids_t->numel();
auto *w = w_t->data<float>();
auto *ids = ids_t->data<int64_t>();
auto *out = out_t->mutable_data<float>(TARGET(kCUDA));
dim3 threads(128, 8);
dim3 grids(8, 1);
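  // padding_idx == -1 means there is no padding row; otherwise the templated
  // flag makes the kernel zero-fill rows whose id equals padding_idx.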
if (padding_idx == -1) {
LookupTableKernel<128, 8, 8, false><<<grids, threads, 0, stream>>>(
out, w, ids, N, K, D, padding_idx);
} else {
LookupTableKernel<128, 8, 8, true><<<grids, threads, 0, stream>>>(
out, w, ids, N, K, D, padding_idx);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(lookup_table,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.Finalize();
REGISTER_LITE_KERNEL(lookup_table_v2,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::LookupTableCompute,
def)
.BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
.Finalize();