Unverified commit b7771b58, authored by myq406450149, committed by GitHub

Merge pull request #6 from PaddlePaddle/develop

sync
@@ -67,7 +67,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
 lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
 lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
 lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
-lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
+lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
 lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
 lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
 lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......
@@ -152,9 +152,10 @@ endif()
 if (LITE_WITH_PROFILE)
   add_definitions("-DLITE_WITH_PROFILE")
-  if (LITE_WITH_PRECISION_PROFILE)
+endif()
+if (LITE_WITH_PRECISION_PROFILE)
   add_definitions("-DLITE_WITH_PRECISION_PROFILE")
-  endif()
 endif()
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
@@ -307,6 +307,9 @@ function(add_kernel TARGET device level)
   if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
     return()
   endif()
+  if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
+    return()
+  endif()
   if ("${device}" STREQUAL "Host")
@@ -322,16 +325,11 @@ function(add_kernel TARGET device level)
     set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
   if ("${device}" STREQUAL "X86")
-    if (NOT LITE_WITH_X86)
+    if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL)
       foreach(src ${args_SRCS})
         file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
       endforeach()
       return()
-    elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
-      foreach(src ${args_SRCS})
-        file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
-      endforeach()
-      return()
     endif()
     set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
   endif()
@@ -434,11 +432,13 @@ function(add_operator TARGET level)
     ARGS)
   cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
     return()
   endif()
+  if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
+    return()
+  endif()
   foreach(src ${args_SRCS})
     if(LITE_BUILD_TAILOR)
......
@@ -108,8 +108,8 @@ if (LITE_WITH_PYTHON)
     add_dependencies(publish_inference publish_inference_python_light_demo)
 endif()
-if (LITE_WITH_X86)
-    add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
+if (LITE_WITH_CUDA OR LITE_WITH_X86)
+    add_custom_target(publish_inference_cxx_lib ${TARGET}
             COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
             COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
             COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
@@ -117,50 +117,44 @@ if (LITE_WITH_X86)
             COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
             COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
             COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
+            )
+    add_custom_target(publish_inference_third_party ${TARGET}
+            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
+            COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
+    add_dependencies(publish_inference_cxx_lib bundle_full_api)
+    add_dependencies(publish_inference_cxx_lib bundle_light_api)
+    add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
+    add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
+    add_dependencies(publish_inference publish_inference_cxx_lib)
+    add_dependencies(publish_inference publish_inference_third_party)
+endif()
+if (LITE_WITH_X86)
+    add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
+            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
             COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
             )
-    add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
-    add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
     add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
-    add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared)
-    add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared)
-    add_dependencies(publish_inference publish_inference_x86_cxx_lib)
     add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
             COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
-            COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
             COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
-            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             )
     add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
     add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
+    add_dependencies(publish_inference publish_inference_x86_cxx_lib)
+    add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
if(LITE_WITH_CUDA)
-    add_custom_target(publish_inference_cuda_cxx_lib ${TARGET}
-            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
-            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
-            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
-            COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
-            COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
-            COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
-            COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
-            )
-    add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
-    add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
-    add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
-    add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
-    add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
     add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
-            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
-            COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
             COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
             )
-    add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
     add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
+    add_dependencies(publish_inference publish_inference_cuda_cxx_demos)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
    if (NOT LITE_ON_TINY_PUBLISH)
        # add cxx lib
@@ -192,7 +186,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
        add_dependencies(publish_inference publish_inference_cxx_lib)
        if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
            add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
-                COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
+                COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a
+                COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so)
        endif()
    endif()
else()
......
@@ -8,11 +8,12 @@ if (LITE_ON_TINY_PUBLISH)
  set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
  set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
-set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
-if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
+set(light_lib_DEPS light_api paddle_api paddle_api_light)
+if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
    #full api dynamic library
-    add_library(paddle_full_api_shared SHARED "")
-    target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
+    lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
+        DEPS paddle_api paddle_api_light paddle_api_full)
    add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
    target_link_libraries(paddle_full_api_shared framework_proto)
    if(LITE_WITH_X86)
@@ -27,13 +28,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
    endif(LITE_WITH_CUDA)
    #light api dynamic library
-    lite_cc_library(paddle_light_api_shared MODULE
-        SRCS light_api_shared.cc
+    lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
        DEPS ${light_lib_DEPS}
        ARM_DEPS ${arm_kernels}
        CV_DEPS paddle_cv_arm
-        NPU_DEPS ${npu_kernels})
+        NPU_DEPS ${npu_kernels}
+        )
+    add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
    target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
    set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
    set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
@@ -262,7 +263,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
        CV_DEPS paddle_cv_arm
        NPU_DEPS ${npu_kernels}
        CL_DEPS ${opencl_kernels}
-        FPGA_DEPS ${fpga_kernels})
+        FPGA_DEPS ${fpga_kernels}
+        CV_DEPS paddle_cv_arm
+        NPU_DEPS ${npu_kernels}
+        BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
@@ -311,7 +315,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
  message(STATUS "Compiling opt")
  lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
-        DEPS gflags kernel op optimizer mir_passes utils)
+        DEPS gflags kernel op optimizer mir_passes utils ${host_kernels})
  add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
......
@@ -44,7 +44,10 @@ DEFINE_string(input_shape,
              "set input shapes according to the model, "
              "separated by colon and comma, "
              "such as 1,3,244,244");
-DEFINE_string(input_img_path, "", "the path of input image");
+DEFINE_string(input_img_path,
+              "",
+              "the path of input image, if not set "
+              "input_img_path, the input of model will be 1.0.");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
@@ -57,16 +60,11 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
              "result.txt",
-              "save benchmark "
-              "result to the file");
+              "save the inference time to the file.");
DEFINE_bool(run_model_optimize,
            false,
            "if set true, apply model_optimize_tool to "
            "model and use optimized model to test. ");
-DEFINE_bool(is_quantized_model,
-            false,
-            "if set true, "
-            "test the performance of the quantized model. ");
namespace paddle {
namespace lite_api {
@@ -87,10 +85,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
  std::vector<Place> vaild_places = {
      Place{TARGET(kARM), PRECISION(kFloat)},
  };
-  if (FLAGS_is_quantized_model) {
-    vaild_places.insert(vaild_places.begin(),
-                        Place{TARGET(kARM), PRECISION(kInt8)});
-  }
  config.set_valid_places(vaild_places);
  auto predictor = lite_api::CreatePaddlePredictor(config);
@@ -181,8 +175,8 @@ void Run(const std::vector<int64_t>& input_shape,
int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
-    LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
+  if (FLAGS_model_dir == "") {
+    LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage.";
    exit(0);
  }
......
@@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
+#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
@@ -295,6 +296,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
  inner_places.emplace_back(
      TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+  // Analysis whether the modle is quantized.
+  // For quantized model, add place(arm, int8) to inner_places
  const std::vector<std::string> quant_dequant_op = {
      "fake_quantize_abs_max",
      "fake_quantize_range_abs_max",
@@ -317,7 +320,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
    }
  }
  if (is_quantized_model) {
-    inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
+    inner_places.insert(inner_places.begin(),
+                        Place{TARGET(kARM), PRECISION(kInt8)});
  }
  Program program(desc, scope_, inner_places);
......
@@ -31,10 +31,17 @@ namespace lite {
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
  config_ = config;
+  auto places = config.valid_places();
#ifdef LITE_WITH_CUDA
+  // if kCUDA is included in valid places, it should be initialized first,
+  // otherwise skip this step.
+  for (auto &p : places) {
+    if (p.target == TARGET(kCUDA)) {
      Env<TARGET(kCUDA)>::Init();
+      break;
+    }
+  }
#endif
-  auto places = config.valid_places();
  std::vector<std::string> passes{};
  auto use_layout_preprocess_pass =
      config.model_dir().find("OPENCL_PRE_PRECESS");
......
@@ -13,13 +13,9 @@
// limitations under the License.
#include "lite/api/light_api.h"
+#include <algorithm>
#include "paddle_use_kernels.h"  // NOLINT
#include "paddle_use_ops.h"      // NOLINT
-#ifndef LITE_ON_TINY_PUBLISH
-#include "lite/api/paddle_use_passes.h"
-#endif
-#include <algorithm>
namespace paddle {
namespace lite {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/api/paddle_api.h"
namespace paddle {
namespace lite_api {
void RunModel() {
// 1. Set MobileConfig
MobileConfig mobile_config;
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> mobile_predictor =
CreatePaddlePredictor<MobileConfig>(mobile_config);
}
} // namespace lite_api
} // namespace paddle
@@ -23,6 +23,7 @@
#include "kernel_src_map.h"  // NOLINT
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/op_registry.h"
......
@@ -99,7 +99,8 @@ enum class ActivationType : int {
  kTanh = 6,
  kSwish = 7,
  kExp = 8,
-  NUM = 9,
+  kAbs = 9,
+  NUM = 10,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
@@ -36,7 +36,8 @@ void TestModel(const std::vector<Place>& valid_places) {
  predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
+  input_tensor->Resize(DDim(
+      std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
  auto* data = input_tensor->mutable_data<float>();
  auto item_size = input_tensor->dims().production();
  if (FLAGS_input_img_txt_path.empty()) {
@@ -67,15 +68,13 @@ void TestModel(const std::vector<Place>& valid_places) {
            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
            << " ms in average.";
-  auto* out = predictor.GetOutput(0);
-  ASSERT_EQ(out->dims().size(), 2);
-  ASSERT_EQ(out->dims()[0], 1);
-  ASSERT_EQ(out->dims()[1], 1000);
-  auto* out_data = out->data<float>();
+  auto out = predictor.GetOutputs();
  FILE* fp = fopen("result.txt", "wb");
-  for (int i = 0; i < out->numel(); i++) {
-    fprintf(fp, "%f\n", out_data[i]);
+  for (int i = 0; i < out.size(); i++) {
+    auto* out_data = out[i]->data<float>();
+    for (int j = 0; j < out[i]->numel(); j++) {
+      fprintf(fp, "%f\n", out_data[j]);
+    }
  }
  fclose(fp);
}
......
@@ -13,7 +13,9 @@
// limitations under the License.
#include <gflags/gflags.h>
+#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
+#endif
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
......
@@ -29,6 +29,7 @@ enum class BinaryOperation {
  kADD = 0,
  kMUL = 1,
  kDIV = 2,
+  kSUB = 3,
};
template <typename T>
@@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x,
  if (type == BinaryOperation::kADD) return x + y;
  if (type == BinaryOperation::kMUL) return x * y;
  if (type == BinaryOperation::kDIV) return x / y;
+  if (type == BinaryOperation::kSUB) return x - y;
}
template <typename T>
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_context.h"
+#include <algorithm>
#include <memory>
#include <string>
#include <utility>
@@ -35,8 +36,10 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
  STL::stringstream program_key_ss;
  program_key_ss << file_name << options;
  std::string program_key = program_key_ss.str();
-  auto it = programs_.find(program_key);
-  if (it != programs_.end()) {
+  auto &programs = CLRuntime::Global()->programs();
+  auto it = programs.find(program_key);
+  if (it != programs.end()) {
    VLOG(3) << " --- program -> " << program_key << " has been built --- ";
    return *(it->second);
  }
@@ -47,14 +50,15 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
  CLRuntime::Global()->BuildProgram(program.get(), options);
  VLOG(3) << " --- end build program -> " << program_key << " --- ";
-  programs_[program_key] = std::move(program);
-  return *(programs_[program_key]);
+  programs[program_key] = std::move(program);
+  return *(programs[program_key]);
}
void CLContext::AddKernel(const std::string &kernel_name,
                          const std::string &file_name,
-                          const std::string &options) {
+                          const std::string &options,
+                          const std::string &time_stamp) {
  cl_int status{CL_SUCCESS};
  VLOG(3) << " --- to get program " << file_name << " --- ";
  auto program = GetProgram(file_name, options);
@@ -64,24 +68,29 @@ void CLContext::AddKernel(const std::string &kernel_name,
      new cl::Kernel(program, kernel_name.c_str(), &status));
  CL_CHECK_FATAL(status);
  VLOG(3) << " --- end create kernel --- ";
-  kernels_.emplace_back(std::move(kernel));
+  auto &kernels = CLRuntime::Global()->kernels();
+  auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
+  kernels.emplace_back(std::move(kernel));
  STL::stringstream kernel_key;
-  kernel_key << kernel_name << options;
-  kernel_offset_[kernel_key.str()] = kernels_.size() - 1;
+  kernel_key << kernel_name << options << time_stamp;
+  kernel_offset_map[kernel_key.str()] = kernels.size() - 1;
}
cl::Kernel &CLContext::GetKernel(const int index) {
-  VLOG(3) << " --- kernel count: " << kernels_.size() << " --- ";
-  CHECK(static_cast<size_t>(index) < kernels_.size())
+  auto &kernels = CLRuntime::Global()->kernels();
+  VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
+  CHECK(static_cast<size_t>(index) < kernels.size())
      << "The index must be less than the size of kernels.";
-  CHECK(kernels_[index] != nullptr)
+  CHECK(kernels[index] != nullptr)
      << "The target kernel pointer cannot be null.";
-  return *(kernels_[index]);
+  return *(kernels[index]);
}
cl::Kernel &CLContext::GetKernel(const std::string &name) {
-  auto it = kernel_offset_.find(name);
-  CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: "
+  auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
+  auto it = kernel_offset_map.find(name);
+  CHECK(it != kernel_offset_map.end()) << "Cannot find the kernel function: "
                                    << name;
  return GetKernel(it->second);
}
@@ -121,14 +130,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
  }
}
+cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
+                                         size_t max_work_size,
+                                         int divisor) {
+  int preferred_lws = 0;
+#if 1
+  auto gws0 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws2 = global_work_size[2];
+#else
+  auto gws2 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws0 = global_work_size[2];
+#endif
+  if (divisor > 1) {
+    max_work_size /= divisor;
+  }
+  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
+    max_work_size = preferred_lws;
+  }
+  while (gws1 > max_work_size && max_work_size > 0) {
+    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
+  }
+  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
+    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
+  }
+  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
+    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
+  }
+#if 1
+  return cl::NDRange{static_cast<size_t>(gws0),
+                     static_cast<size_t>(gws1),
+                     static_cast<size_t>(gws2)};
+#else
+  return cl::NDRange{static_cast<size_t>(gws2),
+                     static_cast<size_t>(gws1),
+                     static_cast<size_t>(gws0)};
+#endif
+}
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
                                     size_t max_work_size) {
  int preferred_lws = 0;
  int divisor = 2;
-  auto tmp0 = global_work_size[0];
-  auto tmp1 = global_work_size[1];
-  auto tmp2 = global_work_size[2];
+  auto gws0 = global_work_size[0];
+  auto gws1 = global_work_size[1];
+  auto gws2 = global_work_size[2];
  if (divisor > 1) {
    max_work_size /= divisor;
@@ -136,18 +184,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
  if (preferred_lws > 0 && preferred_lws <= max_work_size) {
    max_work_size = preferred_lws;
  }
-  while (tmp1 > max_work_size && max_work_size > 0) {
-    tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
+  while (gws1 > max_work_size && max_work_size > 0) {
+    gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
  }
-  while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
-    tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
+  while (gws2 * gws1 > max_work_size && max_work_size > 0) {
+    gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
  }
-  while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
-    tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
+  while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
+    gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
  }
-  return cl::NDRange{static_cast<size_t>(tmp0),
-                     static_cast<size_t>(tmp1),
-                     static_cast<size_t>(tmp2)};
+  return cl::NDRange{static_cast<size_t>(gws0),
+                     static_cast<size_t>(gws1),
+                     static_cast<size_t>(gws2)};
}
}  // namespace lite
......
@@ -36,7 +36,8 @@ class CLContext {
  void AddKernel(const std::string &kernel_name,
                 const std::string &file_name,
-                 const std::string &options = "");
+                 const std::string &options = "",
+                 const std::string &time_stamp = "");
  cl::Kernel &GetKernel(const int index);
@@ -46,10 +47,11 @@ class CLContext {
  cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
- private:
-  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
-  std::vector<std::unique_ptr<cl::Kernel>> kernels_;
-  std::map<std::string, int> kernel_offset_;
+  cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
+                                size_t max_work_size,
+                                int divitor = 2);
+  // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
+  //                                  size_t max_work_size);
};
}  // namespace lite
......
@@ -14,6 +14,7 @@ limitations under the License. */
#include "lite/backends/opencl/cl_runtime.h"
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/utils/cp_logging.h"
@@ -29,10 +30,26 @@ CLRuntime* CLRuntime::Global() {
CLRuntime::~CLRuntime() {
  if (command_queue_ != nullptr) {
+    command_queue_->flush();
    command_queue_->finish();
  }
+  // For controlling the destruction order:
+  for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
+    clReleaseKernel(kernels_[kidx]->get());
+    kernels_[kidx].reset();
+  }
+  kernels_.clear();
+  kernel_offset_.clear();
+  for (auto& p : programs_) {
+    clReleaseProgram(p.second->get());
+  }
+  programs_.clear();
+  // For controlling the destruction order
+  command_queue_&& clReleaseCommandQueue(command_queue_->get());
  command_queue_.reset();
+  context_&& clReleaseContext(context_->get());
  context_.reset();
  device_.reset();
  platform_.reset();
@@ -73,14 +90,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
  return *command_queue_;
}
-std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
+std::shared_ptr<cl::Program> CLRuntime::CreateProgram(
    const cl::Context& context, std::string file_name) {
  auto cl_file = opencl_kernels_files.find(file_name);
  std::string content(cl_file->second.begin(), cl_file->second.end());
  cl::Program::Sources sources;
  sources.push_back(content);
  auto prog =
-      std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
+      std::shared_ptr<cl::Program>(new cl::Program(context, sources, &status_));
  VLOG(4) << "OpenCL kernel file name: " << file_name;
  VLOG(4) << "Program source size: " << content.size();
  CL_CHECK_FATAL(status_);
......
@@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
+#include <unordered_map>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
@@ -42,7 +43,7 @@ class CLRuntime {
  cl::CommandQueue& command_queue();
-  std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
+  std::shared_ptr<cl::Program> CreateProgram(const cl::Context& context,
                                             std::string file_name);
  std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
@@ -57,6 +58,12 @@ class CLRuntime {
  std::map<std::string, size_t>& GetDeviceInfo();
+  std::unordered_map<std::string, std::shared_ptr<cl::Program>>& programs() {
+    return programs_;
+  }
+  std::vector<std::unique_ptr<cl::Kernel>>& kernels() { return kernels_; }
+  std::map<std::string, int>& kernel_offset() { return kernel_offset_; }
 private:
  CLRuntime() = default;
@@ -98,6 +105,12 @@ class CLRuntime {
  std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
+  std::unordered_map<std::string, std::shared_ptr<cl::Program>> programs_{};
+  std::vector<std::unique_ptr<cl::Kernel>> kernels_{};
+  std::map<std::string, int> kernel_offset_{};
  cl_int status_{CL_SUCCESS};
  bool initialized_{false};
......
@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
                      __FILE__,    \
                      __LINE__);   \
  }
+#ifndef LITE_SHUTDOWN_LOG
#define CL_CHECK_FATAL(err_code__)   \
  if (err_code__ != CL_SUCCESS) {    \
    LOG(FATAL) << string_format(     \
@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
                      __FILE__,    \
                      __LINE__);   \
  }
+#else
+#define CL_CHECK_FATAL(err_code__)
+#endif
}  // namespace lite
}  // namespace paddle
@@ -181,7 +181,11 @@ class Context<TargetType::kCUDA> {
    Env<TargetType::kCUDA>::Global();
  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() {
+    if (devs.size() > 0) {
      cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
+    } else {
+      LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
+    }
  }
  void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
    CHECK_GT(devs.size(), 0UL)
......
@@ -142,7 +142,7 @@ class Env {
    // Get device count
    count = API::num_devices();
    if (count == 0) {
-      CHECK(false) << "No device found!";
+      LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!";
    } else {
      LOG(INFO) << "Found " << count << " device(s)";
    }
......
@@ -26,7 +26,8 @@ namespace mir {
void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // initialze fuser params
  std::vector<bool> conv_has_bias_cases{true, false};
-  std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
+  std::vector<std::string> conv_type_cases{
+      "conv2d", "depthwise_conv2d", "conv2d_transpose"};
  // start fuse using params
  for (auto conv_has_bias : conv_has_bias_cases) {
    for (auto conv_type : conv_type_cases) {
......
@@ -103,10 +103,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
  std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
  auto conv_weight_t =
      scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
+  if (conv_type_ == "conv2d_transpose") {
+    CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
+             static_cast<size_t>(conv_weight_t->dims()[1]))
+        << "The BN bias's size should be equal to the size of the first "
+        << "dim size of the conv weights";
+  } else {
    CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
             static_cast<size_t>(conv_weight_t->dims()[0]))
        << "The BN bias's size should be equal to the size of the first "
        << "dim size of the conv weights";
+  }
  size_t weight_num = conv_weight_t->data_size();
  bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
  bool is_weight_quantization =
@@ -153,6 +160,22 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
      // compute new conv_weight for int8
      auto weight_scale =
          conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
+      if (conv_type_ == "conv2d_transpose") {
+        int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
+                     conv_weight_t->dims()[3];
+        int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
+        for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) {
+          for (unsigned int i = 0; i < h; ++i) {
+            weight_scale[i] *= fabsf(alpha_data[i]);
+            if (alpha_data[i] < 0.f) {
+              auto ptr_row = conv_weight_d + k * c_size + i * hw;
+              for (unsigned int j = 0; j < hw; ++j) {
+                ptr_row[j] *= -1;
+              }
+            }
+          }
+        }
+      } else {
      for (unsigned int i = 0; i < h; ++i) {
        weight_scale[i] *= fabsf(alpha_data[i]);
        if (alpha_data[i] < 0.f) {
@@ -162,6 +185,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
          }
        }
      }
+      }
      conv_op_desc->SetAttr("weight_scale", weight_scale);
    } else if (is_weight_quantization) {
      std::string scale_name = conv_weight_name + "_quant_scale";
@@ -176,12 +200,26 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
  } else {
    // compute new conv_weight
    auto conv_weight_d = conv_weight_t->mutable_data<float>();
+    if (conv_type_ == "conv2d_transpose") {
+      int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
+                   conv_weight_t->dims()[3];
+      int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
+      for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) {
+        for (unsigned int i = 0; i < h; ++i) {
+          auto ptr_row = conv_weight_d + k * c_size + i * hw;
+          for (unsigned int j = 0; j < hw; ++j) {
+            ptr_row[j] *= alpha_data[i];
+          }
+        }
+      }
+    } else {
    for (unsigned int i = 0; i < h; ++i) {  // n: conv2d output channels
      for (unsigned int j = 0; j < w; ++j) {  // w: conv2d input channels
        conv_weight_d[i * w + j] *= alpha_data[i];
      }
    }
  }
+    }
  // compute new conv_bias
  if (conv_has_bias_ && conv_op_desc->HasInput("Bias") &&
......
@@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
    fuser(graph.get());
  }
-  // delete quant_dequant_node
-  for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) {
-    fusion::DeleteQuantDequantOpFuser fuser(op_type);
-    fuser(graph.get());
-  }
+  // process quant_dequant_node
+  fusion::DeleteQuantDequantOpFuser dqd_fuser;
+  dqd_fuser(graph.get());
}
}  // namespace mir
......
@@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
  auto* output_scale_node = matched.at("output_scale_node");
  auto* output_act_node = matched.at("output_act_node");
-  // obtain values, save values and relink node
+  // obtain scale, save attrs and relink node
  int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
  int range = ((1 << (bit_length - 1)) - 1);
  auto* scope = quant_node->stmt()->op()->scope();
@@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
                           ->GetMutable<lite::Tensor>();
  float scale_value = scale_tensor->data<float>()[0] / range;
+  auto in_act_name = input_act_node->arg()->name;
+  auto out_act_name = output_act_node->arg()->name;
  auto outlinks = output_act_node->outlinks;
  for (auto* quantized_node : outlinks) {
-    auto* op_desc = quantized_node->stmt()->mutable_op_info();
-    op_desc->SetAttr<int>("bit_length", bit_length);
-    op_desc->SetAttr<float>("input_scale", scale_value);
+    // save input scale in quantized op by input argname + index
+    auto op_desc = *quantized_node->stmt()->mutable_op_info();
+    std::string argname;
+    int index;
+    op_desc.GetInputArgname(out_act_name, &argname);
+    op_desc.GetInputIndex(out_act_name, &index);
+    op_desc.SetAttr<float>(argname + std::to_string(index) + "_input_scale",
+                           scale_value);
+    op_desc.SetAttr<float>("input_scale", scale_value);  // save it for now
+    op_desc.SetAttr<int>("bit_length", bit_length);
+    op_desc.UpdateAllInputs(out_act_name, in_act_name);
+    quantized_node->stmt()->ResetOp(op_desc, graph->valid_places());
    IR_NODE_LINK_TO(input_act_node, quantized_node)
  }
@@ -125,18 +136,17 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
  auto* dequant_op = matched.at("dequant_op");
  auto* dequant_op_out = matched.at("dequant_op_out");
-  // obtain input_scale and weight_scale
+  // obtain weight_scale from max_range
  auto* scope = quantized_op->stmt()->op()->scope();
  auto& valid_places = quantized_op->stmt()->op()->valid_places();
  int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
  int range = ((1 << (bit_length - 1)) - 1);
-  float input_scale =
-      quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
  float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
  float whole_weight_scale =
      static_cast<float>(range * range) / max_range / range;
-  // max_range = range * range / max(abs(weight))
-  // weight_scale = range * range / (range * range / max(abs(weight))) / range
+  // As: max_range = range * range / max(abs(weight))
+  // So: whole_weight_scale
+  //     = range * range / (range * range / max(abs(weight))) / range
  //     = max(abs(weight)) / range
  // set op desc
@@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
    // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
    // be Cout.
    weight_scale_size = quantized_weight_t->dims()[0];
-  } else if (quantized_op_type_ == "mul") {
+  } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") {
    op_desc.SetInput("X", {quantized_op_input->arg()->name});
    op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
    // Fc weight: Cin * Cout, the weight_scale_size should be Cout.
@@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
    weight_scale.push_back(whole_weight_scale);
  }
  op_desc.SetAttr("enable_int8", true);
-  op_desc.SetAttr("input_scale", input_scale);
  op_desc.SetAttr("weight_scale", weight_scale);
  // change the weight from the float type to int8 type.
@@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() {
                               ->assert_is_op_output(quantized_op_type_)
                               ->assert_is_op_input(dequant_op_type, "X")
                               ->AsIntermediate();
+  // The scale var_node of input activation is deleted in DeleteQuantOpFuser
  auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale")
                                       ->assert_is_op_input(dequant_op_type)
                                       ->AsIntermediate();
@@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
  auto* dequant_op = matched.at("dequant_op");
  auto* dequant_op_out = matched.at("dequant_op_out");
-  // obtain input_scale and weight_scale
+  // obtain input weight_scale from fake_dequant op
  auto* scope = quantized_op->stmt()->op()->scope();
  auto& valid_places = quantized_op->stmt()->op()->valid_places();
-  float input_scale =
-      quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
  std::vector<float> weight_scale;
  std::vector<int> quant_bits =
@@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
  // set op desc
  cpp::OpDesc op_desc = *quantized_op->stmt()->op_info();
+  if (quantized_op_type_ == "conv2d" ||
+      quantized_op_type_ == "depthwise_conv2d") {
    op_desc.SetInput("Input", {quantized_op_input->arg()->name});
    op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
+  } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") {
+    op_desc.SetInput("X", {quantized_op_input->arg()->name});
+    op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
+  }
  op_desc.SetAttr("enable_int8", true);
-  op_desc.SetAttr("input_scale", input_scale);
  op_desc.SetAttr("weight_scale", weight_scale);
  // change the weight from the float type to int8 type.
@@ -297,14 +309,12 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
void DeleteQuantDequantOpFuser::BuildPattern() {
  std::string quant_dequant_op_type =
      "fake_quantize_dequantize_moving_average_abs_max";
-  if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") {
    auto* input_scale_node =
        VarNode("input_scale_node")
            ->assert_is_op_input(quant_dequant_op_type, "InScale");
-    auto* input_act_node = VarNode("input_act_node")
-                               ->assert_is_op_input(quant_dequant_op_type, "X");
-    auto* quant_dequant_node =
-        OpNode("quant_dequant_node", quant_dequant_op_type)
+    auto* input_act_node =
+        VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X");
+    auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type)
            ->assert_is_op(quant_dequant_op_type);
    auto* output_scale_node =
        VarNode("output_scale_node")
@@ -312,77 +322,23 @@ void DeleteQuantDequantOpFuser::BuildPattern() {
    auto* output_act_node =
        VarNode("output_act_node")
            ->assert_is_op_output(quant_dequant_op_type, "Out");
-    auto* quantized_node = OpNode("quantized_node", quantized_op_type_)
-                               ->assert_is_op(quantized_op_type_);
    quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
    output_scale_node->LinksFrom({quant_dequant_node});
    output_act_node->LinksFrom({quant_dequant_node});
-    quantized_node->LinksFrom({output_act_node});
-  } else if (quantized_op_type_ == "elementwise_add") {
-    auto* input_scale_left_node =
-        VarNode("input_scale_left_node")
-            ->assert_is_op_input(quant_dequant_op_type, "InScale");
-    auto* input_act_left_node =
-        VarNode("input_act_left_node")
-            ->assert_is_op_input(quant_dequant_op_type, "X");
-    auto* quant_dequant_left_node =
-        OpNode("quant_dequant_left_node", quant_dequant_op_type)
-            ->assert_is_op(quant_dequant_op_type);
-    auto* output_scale_left_node =
-        VarNode("output_scale_left_node")
-            ->assert_is_op_output(quant_dequant_op_type, "OutScale");
-    auto* output_act_left_node =
-        VarNode("output_act_left_node")
-            ->assert_is_op_output(quant_dequant_op_type, "Out")
-            ->assert_is_op_input(quantized_op_type_, "X");
-    quant_dequant_left_node->LinksFrom(
-        {input_scale_left_node, input_act_left_node});
-    output_scale_left_node->LinksFrom({quant_dequant_left_node});
-    output_act_left_node->LinksFrom({quant_dequant_left_node});
-    auto* input_scale_right_node =
-        VarNode("input_scale_right_node")
-            ->assert_is_op_input(quant_dequant_op_type, "InScale");
-    auto* input_act_right_node =
-        VarNode("input_act_right_node")
-            ->assert_is_op_input(quant_dequant_op_type, "X");
-    auto* quant_dequant_right_node =
-        OpNode("quant_dequant_right_node", quant_dequant_op_type)
-            ->assert_is_op(quant_dequant_op_type);
-    auto* output_scale_right_node =
-        VarNode("output_scale_right_node")
-            ->assert_is_op_output(quant_dequant_op_type, "OutScale");
-    auto* output_act_right_node =
-        VarNode("output_act_right_node")
-            ->assert_is_op_output(quant_dequant_op_type, "Out")
-            ->assert_is_op_input(quantized_op_type_, "Y");
-    quant_dequant_right_node->LinksFrom(
-        {input_scale_right_node, input_act_right_node});
-    output_scale_right_node->LinksFrom({quant_dequant_right_node});
-    output_act_right_node->LinksFrom({quant_dequant_right_node});
-    auto* quantized_node = OpNode("quantized_node", quantized_op_type_)
-                               ->assert_is_op(quantized_op_type_);
-    quantized_node->LinksFrom({output_act_left_node, output_act_right_node});
-  } else {
-    LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_;
-  }
-  VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:"
-          << quantized_op_type_;
}
void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) { const key2nodes_t& matched) {
if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") {
auto* input_scale_node = matched.at("input_scale_node"); auto* input_scale_node = matched.at("input_scale_node");
auto* input_act_node = matched.at("input_act_node"); auto* input_act_node = matched.at("input_act_node");
auto* quant_dequant_node = matched.at("quant_dequant_node"); auto* quant_dequant_node = matched.at("quant_dequant_node");
auto* output_scale_node = matched.at("output_scale_node"); auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node"); auto* output_act_node = matched.at("output_act_node");
auto* quantized_node = matched.at("quantized_node"); auto input_act_name = input_act_node->arg()->name;
auto output_act_name = output_act_node->arg()->name;
// obtain values, save values and relink node // Get scale value from scale var node
int bit_length = int bit_length =
quant_dequant_node->stmt()->op_info()->GetAttr<int>("bit_length"); quant_dequant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1); int range = ((1 << (bit_length - 1)) - 1);
...@@ -391,73 +347,27 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -391,73 +347,27 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
->GetMutable<lite::Tensor>(); ->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range; float scale_value = scale_tensor->data<float>()[0] / range;
auto* op_desc = quantized_node->stmt()->mutable_op_info(); auto quantized_nodes = output_act_node->outlinks;
op_desc->SetAttr<int>("bit_length", bit_length); for (auto* quantized_node : quantized_nodes) {
op_desc->SetAttr<float>("input_scale", scale_value); // Save quantization info in op_info attr
op_desc->SetInput("X", {input_act_node->arg()->name}); auto op_info = *quantized_node->stmt()->op_info();
IR_NODE_LINK_TO(input_act_node, quantized_node) std::string argname;
auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); int index;
quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); op_info.GetInputArgname(output_act_name, &argname);
op_info.GetInputIndex(output_act_name, &index);
// delete nodes and edges op_info.SetAttr<float>(argname + std::to_string(index) + "_input_scale",
std::unordered_set<const Node*> nodes2rm = {input_scale_node, scale_value);
quant_dequant_node, op_info.SetAttr<float>("input_scale", scale_value); // Save it for now
output_scale_node, op_info.SetAttr<int>("bit_length", bit_length);
output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm); op_info.UpdateAllInputs(output_act_name, input_act_name);
} else if (quantized_op_type_ == "elementwise_add") { quantized_node->stmt()->ResetOp(op_info, graph->valid_places());
auto* input_scale_left_node = matched.at("input_scale_left_node"); IR_NODE_LINK_TO(input_act_node, quantized_node);
auto* input_act_left_node = matched.at("input_act_left_node"); }
auto* quant_dequant_left_node = matched.at("quant_dequant_left_node");
auto* output_scale_left_node = matched.at("output_scale_left_node");
auto* output_act_left_node = matched.at("output_act_left_node");
auto* input_scale_right_node = matched.at("input_scale_right_node");
auto* input_act_right_node = matched.at("input_act_right_node");
auto* quant_dequant_right_node = matched.at("quant_dequant_right_node");
auto* output_scale_right_node = matched.at("output_scale_right_node");
auto* output_act_right_node = matched.at("output_act_right_node");
auto* quantized_node = matched.at("quantized_node");
// obtain values, save values and relink node
int bit_length =
quant_dequant_left_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_dequant_left_node->stmt()->op()->scope();
auto* left_scale_tensor =
scope->FindVar(output_scale_left_node->arg()->name)
->GetMutable<lite::Tensor>();
float left_scale_value = left_scale_tensor->data<float>()[0] / range;
auto* right_scale_tensor =
scope->FindVar(output_scale_right_node->arg()->name)
->GetMutable<lite::Tensor>();
float right_scale_value = right_scale_tensor->data<float>()[0] / range;
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
op_desc->SetAttr<float>("x_input_scale", left_scale_value);
op_desc->SetAttr<float>("y_input_scale", right_scale_value);
op_desc->SetInput("X", {input_act_left_node->arg()->name});
op_desc->SetInput("Y", {input_act_right_node->arg()->name});
IR_NODE_LINK_TO(input_act_left_node, quantized_node)
IR_NODE_LINK_TO(input_act_right_node, quantized_node)
auto update_op_desc = *quantized_node->stmt()->mutable_op_info();
quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places());
// delete nodes and edges // delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {input_scale_left_node, std::unordered_set<const Node*> nodes2rm = {
quant_dequant_left_node, input_scale_node, quant_dequant_node, output_scale_node, output_act_node};
output_scale_left_node,
output_act_left_node,
input_scale_right_node,
quant_dequant_right_node,
output_scale_right_node,
output_act_right_node};
GraphSafeRemoveNodes(graph, nodes2rm); GraphSafeRemoveNodes(graph, nodes2rm);
} else {
LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_;
}
} }
cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
......
...@@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase { ...@@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase {
}; };
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max + /* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
 * pooled/elementwise_add" can be detected by this fuser. The fuser * quantized_op" can be detected by this fuser. The fuser modifies the input
* extract the input_scale form fake_quant_dequant_op and save into * scale for the quantized_op and deletes the fake_quant_dequant_op.
* the quantized_op. Besides, the fuser delete fake_quant_dequant_op in
* the graph.
*/ */
class DeleteQuantDequantOpFuser : public FuseBase { class DeleteQuantDequantOpFuser : public FuseBase {
public: public:
explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override; void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private: private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quantized_op_type_{};
}; };
} // namespace fusion } // namespace fusion
......
...@@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc { ...@@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc {
return false; return false;
} }
  // For the given input variable name, find its index within the value
  // list of the corresponding input argument
bool GetInputIndex(const std::string &value_name, int *out) const {
for (auto &item : inputs_) {
auto it = std::find(item.second.begin(), item.second.end(), value_name);
if (it != item.second.end()) {
*out = it - item.second.begin();
return true;
}
}
return false;
}
  // For the given output variable name, find its index within the value
  // list of the corresponding output argument
bool GetOutputIndex(const std::string &value_name, int *out) const {
for (auto &item : outputs_) {
auto it = std::find(item.second.begin(), item.second.end(), value_name);
if (it != item.second.end()) {
*out = it - item.second.begin();
return true;
}
}
return false;
}
void UpdateAllInputs(const std::string &from, const std::string &to) { void UpdateAllInputs(const std::string &from, const std::string &to) {
for (auto &item : inputs_) { for (auto &item : inputs_) {
for (auto &var : item.second) { for (auto &var : item.second) {
......
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
* of each kernel. * of each kernel.
*/ */
#pragma once #pragma once
#include <cmath>
#include <string> #include <string>
#include <vector> #include <vector>
#include "lite/core/program.h" #include "lite/core/program.h"
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
#include "lite/operators/conditional_block_op.h" #include "lite/operators/conditional_block_op.h"
#include "lite/operators/subgraph_op.h" #include "lite/operators/subgraph_op.h"
#include "lite/operators/while_op.h" #include "lite/operators/while_op.h"
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
#include "lite/core/profile/precision_profiler.h" #include "lite/core/profile/precision_profiler.h"
#endif #endif
...@@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
} }
void RuntimeProgram::Run() { void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary = std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader(); inst_precision_profiler.GetSummaryHeader();
#endif
#endif #endif
for (auto& inst : instructions_) { for (auto& inst : instructions_) {
...@@ -149,21 +147,19 @@ void RuntimeProgram::Run() { ...@@ -149,21 +147,19 @@ void RuntimeProgram::Run() {
if (inst.is_feed_fetch_op()) continue; if (inst.is_feed_fetch_op()) continue;
#endif #endif
inst.Run(); inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA #ifndef LITE_WITH_FPGA
precision_profiler_summary += precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst); inst_precision_profiler.GetInstPrecision(&inst);
#endif #endif
#endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
} }
#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary; LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE #endif
#endif // LITE_WITH_PROFILE
} }
void Program::Build(const cpp::ProgramDesc& prog) { void Program::Build(const cpp::ProgramDesc& prog) {
......
# Introduction
PaddleLite is well known for on-device inference, but it also supports training models on mobile devices. This document walks through a training example with PaddleLite; the task is Boston housing price prediction, also known as "fit-a-line".
You can learn more about how this task is defined and modeled from the
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
and [source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
in the book repository; the task is modeled with linear regression.
This document focuses on how to port that training workload to Paddle-Lite.
Note: this tutorial trains with the C++ API; the other APIs do not support training yet.
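Conceptually, a training step through the C++ API looks just like inference: the program saved by `train.py --save_model` already contains the backward and SGD ops, so feeding the features and the labels and calling `Run()` performs one parameter update, and the fetched output is the mean loss. Below is a minimal sketch condensed from the `demo_trainer.cc` listed later in this demo; the input indices, the `model_dir` path, and the fake batch values are assumptions of this sketch rather than guaranteed API behavior.

```cpp
#include <algorithm>
#include <iostream>
#include <vector>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

int main() {
  // Load the trainable program saved by `train.py --save_model`.
  CxxConfig config;
  config.set_model_dir("model_dir");
  config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);

  const int batch_size = 1, feature_num = 13;
  std::vector<float> x(batch_size * feature_num, 0.5f);  // fake features
  std::vector<float> y(batch_size, 21.0f);               // fake labels

  auto x_tensor = predictor->GetInput(0);  // feed var 'x'
  x_tensor->Resize({batch_size, feature_num});
  std::copy(x.begin(), x.end(), x_tensor->mutable_data<float>());

  auto y_tensor = predictor->GetInput(1);  // feed var 'y'
  y_tensor->Resize({batch_size, 1});
  std::copy(y.begin(), y.end(), y_tensor->mutable_data<float>());

  // One Run() does forward + backward + SGD update, because the program
  // serialized by train.py still contains the optimizer ops.
  predictor->Run();
  std::cout << "loss: " << predictor->GetOutput(0)->data<float>()[0]
            << std::endl;
  return 0;
}
```

The sketch runs a single step for brevity; the real `demo_trainer.cc` creates the predictor once and then loops over batches, which is what you want in practice.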
# Requirements
- An Android phone, used to run the training program
- A Python environment with Paddle (version: 1.7.0) installed
# Quick start
## Step1 build paddle-lite
Build the full_publish paddle-lite lib following the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite). Taking a build on Linux as an example, the commands are:
```shell
## set up the environment
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz
export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH
wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
unzip android-ndk-r17c-linux-x86_64.zip
export NDK_ROOT=/opt/android-ndk-r17c
## build
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv7 \
--build_extra=ON \
--arm_lang=gcc \
--android_stl=c++_static \
--build_train=ON full_publish
```
Artifacts:
```shell
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
## Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c
```
Artifacts:
```shell
bin/
`-- demo_trainer
```
## Step3 download model and run it!
On your laptop, connect the phone over USB, enable developer mode on the phone, and run the following in any directory:
```shell
local_path=/data/local/tmp/linear_regression
adb shell "mkdir "${local_path}
# download model and push to mobile
wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
tar -zxvf lite_lr_model.tar.gz
adb push lite_lr_model/housing.data ${local_path}
adb push lite_lr_model/model_dir ${local_path}
# push lib and executable file to mobile
adb push libpaddle_full_api_shared.so ${local_path}
adb push demo_trainer ${local_path}
adb shell chmod +x ${local_path}/demo_trainer
# run it!
adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true"
```
Expected output:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
# More details
The model used above was downloaded directly. If you want to generate it yourself, run the following commands:
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite/lite/demo/cxx/train_demo/
python train.py --save_model
```
Artifacts:
```shell
model_dir/
|-- fc_0.b_0
|-- fc_0.w_0
|-- learning_rate_0
`-- __model__
md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
```
If you want to generate your own model for training, you can refer to the way `train.py` saves the model.
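For reference, the saving logic in `train.py` (listed further below) boils down to: dump the persistable variables with `fluid.io.save_persistables`, attach feed ops for `x`/`y` and a fetch op for the mean loss to the main program, and serialize the program description to `__model__`. A condensed sketch of that flow, assuming the same variable names as the script (the helper name here is made up for illustration):

```python
import paddle.fluid as fluid

def save_trainable_model(exe, main_program, model_dir="model_dir"):
    # Dump all persistable variables (fc_0.w_0, fc_0.b_0, learning_rate_0).
    fluid.io.save_persistables(exe, model_dir)
    # Make the program self-contained: add feed ops for the inputs and a
    # fetch op for the mean loss, then serialize the program description.
    fluid.io.prepend_feed_ops(main_program, ['x', 'y'])
    fluid.io.append_fetch_ops(main_program, ['mean_0.tmp_0'])
    with open(model_dir + "/__model__", "wb") as f:
        f.write(main_program.desc.serialize_to_string())
```

`train.py --save_model` does essentially this before exiting, which is why `model_dir/` contains the parameter files plus a `__model__` that the C++ trainer can load directly.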
# Checking against Paddle training results
## First 10 loss values
To verify that paddle and lite behave consistently, we keep the model parameters and the data identical, set batch size = 1, train for 10 batches, and record the loss values of both.
python + paddle command:
```shell
python train.py --num_steps=10 --batch_size=1
```
python + paddle result:
```shell
Train cost, Step 0, Cost 564.317017
Train cost, Step 1, Cost 463.900238
Train cost, Step 2, Cost 1197.537354
Train cost, Step 3, Cost 1093.833008
Train cost, Step 4, Cost 1282.760254
Train cost, Step 5, Cost 792.097351
Train cost, Step 6, Cost 491.775848
Train cost, Step 7, Cost 698.496033
Train cost, Step 8, Cost 248.444885
Train cost, Step 9, Cost 325.135132
```
c++ with paddle-lite command:
```
./demo_trainer true
```
c++ with paddle-lite result:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
## Loss curve
With the training batch size fixed at 20 and the training data globally shuffled every epoch, the loss curves of paddle and lite after 100 epochs compare as follows.
![lr_loss](image/lr_loss.png)
To reproduce the result above, the paddle + python commands are:
```
git clone https://github.com/PaddlePaddle/book.git
cd book/01.fit_a_line
python train.py
```
The lite + c++ command is:
```
./demo_trainer false
```
cmake_minimum_required(VERSION 2.8)
set (CMAKE_CXX_STANDARD 11)
# Project's name
if(NOT DEFINED LITE_ROOT)
message(FATAL_ERROR "please set LITE_ROOT with
-DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/")
endif()
project(demo_trainer)
# Set the output folder where your program will be created
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
# The following folder will be included
include_directories("include")
include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include")
add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc)
TARGET_LINK_LIBRARIES(demo_trainer
"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/data_reader.h"
#include <limits>
using std::string;
using std::vector;
int FEATURE_NUM = 13;
float rate = 0.8;
int get_samples(string line, vector<float>* feature, float* label) {
std::istringstream reader(line);
std::vector<float> numbers;
do {
// read as many numbers as possible.
for (float number; reader >> number;) {
numbers.push_back(number);
}
// consume and discard token from stream.
if (reader.fail()) {
reader.clear();
std::string token;
reader >> token;
}
} while (!reader.eof());
assert(numbers.size() == FEATURE_NUM + 1);
for (int i = 0; i < FEATURE_NUM; i++) {
feature->push_back(numbers[i]);
}
*label = numbers[FEATURE_NUM];
return 0;
}
int normalize(const vector<vector<float>>& origin_features,
vector<vector<float>>* features,
float rate) {
int inf = std::numeric_limits<int>::max();
vector<float> min_vec(FEATURE_NUM, static_cast<float>(inf));
vector<float> max_vec(FEATURE_NUM, -(static_cast<float>(inf)));
vector<float> sum_vec(FEATURE_NUM, 0);
vector<float> avg_vec(FEATURE_NUM, 0);
for (int i = 0; i < origin_features.size(); i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
min_vec[j] = min(min_vec[j], origin_features[i][j]);
max_vec[j] = max(max_vec[j], origin_features[i][j]);
sum_vec[j] += origin_features[i][j];
}
}
for (int i = 0; i < FEATURE_NUM; i++) {
avg_vec[i] = sum_vec[i] / origin_features.size();
}
for (int i = 0; i < origin_features.size() * rate - 1; i++) {
vector<float> feat;
for (int j = 0; j < FEATURE_NUM; j++) {
feat.push_back((origin_features[i][j] - avg_vec[j]) /
(max_vec[j] - min_vec[j]));
}
features->push_back(feat);
}
  return 0;
}
int read_samples(const string fname,
vector<vector<float>>* features,
vector<float>* labels) {
fstream fin;
fin.open(fname);
if (!static_cast<bool>(fin)) {
return 1;
}
vector<vector<float>> origin_features;
vector<string> lines;
string line;
while (getline(fin, line)) {
lines.push_back(line);
}
fin.close();
for (int i = 0; i < lines.size(); i++) {
vector<float> feat;
float lbl = 0;
get_samples(lines[i], &feat, &lbl);
origin_features.push_back(feat);
if (i < lines.size() * rate - 1) {
labels->push_back(lbl);
}
}
  cout << "finished reading data" << endl;
normalize(origin_features, features, rate);
assert(features->size() == labels->size());
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <vector>
#include "include/data_reader.h"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
class LRModel {
public:
void InitModel() {
// 1. Set CxxConfig
CxxConfig config;
config.set_model_dir("model_dir");
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
config.set_valid_places(valid_places);
predictor_ = CreatePaddlePredictor<CxxConfig>(config);
}
float Predict(const vector<vector<float>>& features,
const vector<float>& labels) {
// Create Tensor
assert(features.size() == labels.size());
int batch_size = features.size();
std::unique_ptr<Tensor> input_tensor(std::move(predictor_->GetInput(0)));
input_tensor->Resize(shape_t({batch_size, FEATURE_NUM}));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
data[FEATURE_NUM * i + j] = features[i][j];
}
}
std::unique_ptr<Tensor> y_tensor(std::move(predictor_->GetInput(1)));
y_tensor->Resize(shape_t({batch_size, 1}));
auto* y_data = y_tensor->mutable_data<float>();
for (int i = 0; i < batch_size; i++) {
y_data[i] = labels[i];
}
predictor_->Run();
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor_->GetOutput(0)));
return output_tensor->data<float>()[0];
}
private:
std::shared_ptr<PaddlePredictor> predictor_;
};
int shuffle(vector<vector<float>>* features, vector<float>* labels) {
assert(features->size() == labels->size());
vector<int> index;
for (int i = 0; i < features->size(); i++) {
index.push_back(i);
}
random_shuffle(index.begin(), index.end());
vector<vector<float>> tmp_features;
vector<float> tmp_labels;
for (int i = 0; i < features->size(); i++) {
tmp_features.push_back((*features)[index[i]]);
tmp_labels.push_back((*labels)[index[i]]);
}
for (int i = 0; i < features->size(); i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
(*features)[i][j] = tmp_features[i][j];
}
(*labels)[i] = tmp_labels[i];
}
return 0;
}
int main(int argc, char* argv[]) {
if (argc < 2) {
cerr << "usage: ./demo_trainer is_small" << endl;
cerr << " if is_small is true, the batch size is set to 1, " << endl;
    cerr << "       and it will only run for 10 steps." << endl;
return 1;
}
string is_small = argv[1];
vector<vector<float>> features;
vector<float> labels;
read_samples("housing.data", &features, &labels);
cout << "sample count: " << features.size() << " " << endl;
std::shared_ptr<LRModel> local_model(new LRModel());
local_model->InitModel();
if (is_small == "true") {
cout << "small mode" << endl;
    for (int i = 0; i < 10; i++) {
vector<vector<float>> batch_feature;
vector<float> batch_label;
batch_feature.push_back(features[i]);
batch_label.push_back(labels[i]);
auto loss = local_model->Predict(batch_feature, batch_label);
cout << "sample " << i << ": " << loss << endl;
}
} else if (is_small == "false") {
    // full mode: shuffle the data every epoch and train for 100 epochs
    cout << "full mode" << endl;
    int epoch = 100;
    int batch_size = 20;
    int step = 0;
    for (int i = 0; i < epoch; i++) {
shuffle(&features, &labels);
for (int j = 0;
j < ceil(static_cast<float>(features.size()) / batch_size);
j++) {
int start_idx = j * batch_size;
int end_idx =
min((j + 1) * batch_size, static_cast<int>(features.size()));
auto batch_feature = vector<vector<float>>(features.begin() + start_idx,
features.begin() + end_idx);
auto batch_label =
vector<float>(labels.begin() + start_idx, labels.begin() + end_idx);
auto loss = local_model->Predict(batch_feature, batch_label);
if (step % 10 == 0) {
          std::cout << "epoch: " << i << ", step: " << step
<< ", Loss: " << loss << endl;
}
step += 1;
}
}
} else {
cerr << "wrong arg for is_small: " << is_small << endl;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using std::string;
using std::vector;
using std::cerr;
using std::cout;
using std::endl;
using std::min;
using std::max;
using std::fstream;
extern int FEATURE_NUM;
int get_samples(string line, vector<float>* feature, float* label);
int read_samples(const string fname,
vector<vector<float>>* features,
vector<float>* labels);
rm -rf build
mkdir build
cd build
LITE_ROOT=$1
NDK_ROOT=$2
cmake .. \
-DLITE_ROOT=${LITE_ROOT} \
-DNDK_ROOT=${NDK_ROOT} \
-DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \
-DANDROID_TOOLCHAIN=gcc \
-DANDROID_ABI="armeabi-v7a" \
-DANDROID_PLATFORM=android-23 \
-DANDROID=true \
-DANDROID_STL=c++_static
make
cd ..
# ./bin/demo_trainer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import argparse
import math
import numpy
import paddle
import paddle.fluid as fluid
def parse_args():
parser = argparse.ArgumentParser("fit_a_line")
parser.add_argument(
'--save_model',
action='store_true',
help="Whether to save main program")
parser.add_argument(
'--num_steps',
type=int,
default=1000000000000,
help="train steps")
parser.add_argument(
'--num_epochs', type=int, default=100, help="number of epochs.")
parser.add_argument(
'--batch_size', type=int, default=20, help="batch size.")
parser.add_argument(
'--shuffle',
action='store_true',
help="Whether to shuffle train data.")
args = parser.parse_args()
return args
# For training test cost
def train_test(executor, program, reader, feeder, fetch_list):
accumulated = 1 * [0]
count = 0
for data_test in reader():
outs = executor.run(
program=program, feed=feeder.feed(data_test), fetch_list=fetch_list)
accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)]
count += 1
return [x_d / count for x_d in accumulated]
def main():
if args.shuffle:
print("doing shuffle")
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=args.batch_size)
else:
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=args.batch_size)
# feature vector of length 13
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
main_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
main_program.random_seed = 90
startup_program.random_seed = 90
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)
test_program = main_program.clone(for_test=True)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
num_epochs = args.num_epochs
# main train loop.
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe.run(startup_program)
if args.save_model:
fluid.io.save_persistables(exe, "model_dir")
# add feed and fetch op
feeded_var_names = ['x', 'y']
fetch_var_names = ['mean_0.tmp_0']
fluid.io.prepend_feed_ops(main_program, feeded_var_names)
fluid.io.append_fetch_ops(main_program, fetch_var_names)
with open("model_dir/__model__", "wb") as f:
f.write(main_program.desc.serialize_to_string())
with open("debug_main_program", "w") as f:
f.write(str(main_program))
print("train model saved to model_dir")
return
train_prompt = "Train cost"
step = 0
for pass_id in range(num_epochs):
for data_train in train_reader():
avg_loss_value, = exe.run(
main_program,
feed=feeder.feed(data_train),
fetch_list=[avg_loss])
print("%s, Step %d, Cost %f" %
(train_prompt, step, avg_loss_value[0]))
if step == args.num_steps - 1:
return
step += 1
if math.isnan(float(avg_loss_value[0])):
sys.exit("got NaN loss, training failed.")
if __name__ == '__main__':
args = parse_args()
main()
...@@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math ...@@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math
# 4. training kernels # 4. training kernels
add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif()
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
......
...@@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D ...@@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D
lite_cc_library(subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS ${bm_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fill_constant_op_bm SRCS fill_constant_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges set(bm_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
...@@ -58,4 +60,6 @@ set(bm_subgraph_bridges ...@@ -58,4 +60,6 @@ set(bm_subgraph_bridges
subgraph_bridge_reduce_full_op_bm subgraph_bridge_reduce_full_op_bm
subgraph_bridge_squeeze_op_bm subgraph_bridge_squeeze_op_bm
subgraph_bridge_cast_op_bm subgraph_bridge_cast_op_bm
subgraph_bridge_fill_constant_op_bm
subgraph_bridge_assign_value_op_bm
CACHE INTERNAL "bm_subgraph_bridges") CACHE INTERNAL "bm_subgraph_bridges")
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <bmcompiler_if.h> #include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h> #include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
...@@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_var_name = op_info->Output("Out").front(); auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>(); auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims(); auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]); bool x_is_const = !graph->HasNode(x_var_name);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
std::vector<int32_t> i_x_shape_data(x_dims.size()); std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size()); std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) { for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]); i_x_shape_data[i] = x_dims[i];
} }
for (size_t i = 0; i < output_dims.size(); i++) { for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]); i_output_shape_data[i] = output_dims[i];
} }
float alpha = 0.f; float alpha = 0.f;
int active_type_id = 0; int active_type_id = 0;
...@@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG(FATAL) << "[BM] unsupport act type"; LOG(FATAL) << "[BM] unsupport act type";
return FAILED; return FAILED;
} }
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
if (x_is_const) {
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(x_data));
}
if (op_type == "relu" || op_type == "leaky_relu") { if (op_type == "relu" || op_type == "leaky_relu") {
add_relu_layer(graph->GetCompilerHandle(), add_relu_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]), const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int AssignValueConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
int buffer_size = 1;
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_dims[i]);
buffer_size *= i_output_shape_data[i];
}
auto fp32_values = op_info->GetAttr<std::vector<float>>("fp32_values");
float* assign_data =
reinterpret_cast<float*>(malloc(buffer_size * sizeof(float)));
CHECK(assign_data != nullptr);
  CHECK_EQ(buffer_size, fp32_values.size());
  // Copy the attribute values into the const tensor buffer.
  for (int i = 0; i < buffer_size; i++) {
    assign_data[i] = fp32_values[i];
  }
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(i_output_shape_data.data()),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
reinterpret_cast<const void*>(assign_data));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(assign_value,
kBM,
paddle::lite::subgraph::bm::AssignValueConverter);
...@@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto filter_var_name = op_info->Input("Filter").front(); auto filter_var_name = op_info->Input("Filter").front();
auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>(); auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>();
auto filter_dims = filter->dims(); auto filter_dims = filter->dims();
CHECK_EQ(input_dims.size(), 4); CHECK_EQ(input_dims.size(), 4);
CHECK_EQ(output_dims.size(), 4); CHECK_EQ(output_dims.size(), 4);
CHECK_EQ(filter_dims.size(), 4); CHECK_EQ(filter_dims.size(), 4);
...@@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dilations[1], dilations[1],
static_cast<int>(has_bias)); static_cast<int>(has_bias));
graph->AddNode(output_var_name); graph->AddNode(output_var_name);
  LOG(INFO) << output_var_name << " " << input_dims << " " << output_dims;
return SUCCESS; return SUCCESS;
} }
......
...@@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_dims = output->dims(); auto output_dims = output->dims();
const int64_t* output_shape_data = const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]); const_cast<const int64_t*>(&output_dims.data()[0]);
LOG(INFO) << x_dims << " " << output_dims;
std::vector<int32_t> i_output_shape_data(output_dims.size()); std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) { for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]); i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
int FillConstantConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
int buffer_size = 1;
  for (size_t i = 0; i < output_dims.size(); i++) {
    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
    buffer_size *= i_output_shape_data[i];
  }
  float* const_data =
      reinterpret_cast<float*>(malloc(buffer_size * sizeof(float)));
  CHECK(const_data != nullptr);
  auto value = op_info->GetAttr<float>("value");
  for (int i = 0; i < buffer_size; i++) {
    const_data[i] = value;
  }
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(i_output_shape_data.data()),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
reinterpret_cast<const void*>(const_data));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fill_constant,
kBM,
paddle::lite::subgraph::bm::FillConstantConverter);
...@@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info(); auto op_info = op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type); auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// only support y is const
// input // input
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>(); auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
...@@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto y_var_name = op_info->Input("Y").front(); auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>(); auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims(); auto y_dims = y->dims();
bool y_is_const = !graph->HasNode(y_var_name);
CHECK_EQ(y_dims.size(), 2);
int i_y_shape_data[2];
for (size_t i = 0; i < 2; i++) {
i_y_shape_data[i] = y_dims[i];
}
// output // output
auto output_var_name = op_info->Output("Out").front(); auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>(); auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
...@@ -71,6 +76,7 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -71,6 +76,7 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) { for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]); i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
} }
if (y_is_const) {
add_fc_layer(graph->GetCompilerHandle(), add_fc_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_reshape_shape_data[0]), const_cast<const int*>(&i_x_reshape_shape_data[0]),
2, 2,
...@@ -85,6 +91,24 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -85,6 +91,24 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
nullptr, nullptr,
0, 0,
0); 0);
} else {
add_fc_weight_layer(
graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_reshape_shape_data[0]),
2,
static_cast<const char*>(unique_op_reshape_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(unique_op_name.c_str()),
const_cast<const int*>(&i_y_shape_data[0]),
2,
static_cast<const char*>(y_var_name.c_str()),
i_x_reshape_shape_data[1],
nullptr,
0,
0);
}
graph->AddNode(output_var_name); graph->AddNode(output_var_name);
return SUCCESS; return SUCCESS;
} }
......
...@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM); ...@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM);
USE_SUBGRAPH_BRIDGE(squeeze, kBM); USE_SUBGRAPH_BRIDGE(squeeze, kBM);
USE_SUBGRAPH_BRIDGE(squeeze2, kBM); USE_SUBGRAPH_BRIDGE(squeeze2, kBM);
USE_SUBGRAPH_BRIDGE(cast, kBM); USE_SUBGRAPH_BRIDGE(cast, kBM);
USE_SUBGRAPH_BRIDGE(fill_constant, kBM);
USE_SUBGRAPH_BRIDGE(assign_value, kBM);
...@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() {
graph.CreateCompilerHandle(); graph.CreateCompilerHandle();
auto& ctx = this->ctx_->template As<BMContext>(); auto& ctx = this->ctx_->template As<BMContext>();
for (auto& inst : origin_program_) { for (auto& inst : origin_program_) {
auto op = inst.op(); auto op = const_cast<OpLite*>(inst.op());
CHECK(op); CHECK(op);
op->CheckShape(); op->CheckShape();
op->InferShape(); op->InferShape();
......
...@@ -8,6 +8,8 @@ add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_de ...@@ -8,6 +8,8 @@ add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_de
add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps})
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps})
add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps})
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
...@@ -45,6 +47,8 @@ lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_ ...@@ -45,6 +47,8 @@ lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_
#nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) #nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
nv_test(nearest_interp_compute_cuda_test SRCS nearest_interp_compute_test.cc DEPS nearest_interp_compute_cuda) nv_test(nearest_interp_compute_cuda_test SRCS nearest_interp_compute_test.cc DEPS nearest_interp_compute_cuda)
nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda) nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda)
nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda)
nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda)
nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda)
nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda)
nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
...@@ -61,7 +65,7 @@ nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc ...@@ -61,7 +65,7 @@ nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc
#nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
#nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda)
#nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) #nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
if(LITE_BUILD_EXTRA) if(LITE_BUILD_EXTRA)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/abs_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
__global__ void AbsKernel(const int num, const T* input, T* output);
template <>
__global__ void AbsKernel<float>(const int num,
const float* input,
float* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabsf(input[index]);
}
}
template <>
__global__ void AbsKernel<double>(const int num,
const double* input,
double* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabs(input[index]);
}
}
void AbsCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
AbsKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
abs, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::AbsCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AbsCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~AbsCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/abs_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(abs, normal) {
AbsCompute abs_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
abs_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
abs_kernel.SetContext(std::move(ctx));
abs_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], std::fabs(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -152,6 +152,18 @@ void ElementwiseAddComputeNHWC::Run() { ...@@ -152,6 +152,18 @@ void ElementwiseAddComputeNHWC::Run() {
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
} }
void ElementwiseSubCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseSubComputeNHWC::Run() {
ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseMulCompute::Run() { void ElementwiseMulCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false)
cudaError_t error = cudaGetLastError(); cudaError_t error = cudaGetLastError();
...@@ -204,6 +216,17 @@ REGISTER_LITE_KERNEL(elementwise_add, ...@@ -204,6 +216,17 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_add, REGISTER_LITE_KERNEL(elementwise_add,
kCUDA, kCUDA,
kFloat, kFloat,
...@@ -224,6 +247,26 @@ REGISTER_LITE_KERNEL(elementwise_add, ...@@ -224,6 +247,26 @@ REGISTER_LITE_KERNEL(elementwise_add,
DATALAYOUT(kNHWC))}) DATALAYOUT(kNHWC))})
.Finalize(); .Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNHWC,
paddle::lite::kernels::cuda::ElementwiseSubComputeNHWC,
nhwc_format)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul, REGISTER_LITE_KERNEL(elementwise_mul,
kCUDA, kCUDA,
kFloat, kFloat,
......
...@@ -38,6 +38,24 @@ class ElementwiseAddComputeNHWC ...@@ -38,6 +38,24 @@ class ElementwiseAddComputeNHWC
virtual ~ElementwiseAddComputeNHWC() = default; virtual ~ElementwiseAddComputeNHWC() = default;
}; };
class ElementwiseSubCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubCompute() = default;
};
class ElementwiseSubComputeNHWC
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubComputeNHWC() = default;
};
class ElementwiseMulCompute class ElementwiseMulCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public: public:
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/tanh_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
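// Elementwise tanh: one thread per element; threads with index >= num return without writing.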
template <typename T>
__global__ void TanhKernel(const int num, const T* input, T* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = tanh(input[index]);
}
}
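// Launches TanhKernel on the context's exec stream: 512 threads per block, enough blocks to cover all elements.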
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
TanhKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
tanh, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class TanhCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~TanhCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/tanh_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
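// Checks the CUDA tanh kernel on a 3x3 tensor against tanh() computed on the host.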
TEST(tanh, fp32) {
TanhCompute tanh_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
tanh_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
tanh_kernel.SetContext(std::move(ctx));
tanh_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], tanh(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne ...@@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
...@@ -92,6 +92,7 @@ T PolyIoU(const T* box1, ...@@ -92,6 +92,7 @@ T PolyIoU(const T* box1,
const size_t box_size, const size_t box_size,
const bool normalized) { const bool normalized) {
LOG(FATAL) << "PolyIoU not implement."; LOG(FATAL) << "PolyIoU not implement.";
return *box1;
} }
template <class T> template <class T>
...@@ -369,6 +370,7 @@ void MulticlassNmsCompute::Run() { ...@@ -369,6 +370,7 @@ void MulticlassNmsCompute::Run() {
} }
} else { } else {
outs->Resize({static_cast<int64_t>(num_kept), out_dim}); outs->Resize({static_cast<int64_t>(num_kept), out_dim});
outs->mutable_data<float>();
int offset = 0; int offset = 0;
int* oindices = nullptr; int* oindices = nullptr;
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/multiclass_nms_compute.h"
#include <gtest/gtest.h>
#include <map>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
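// Comparator for std::stable_sort: orders (score, index) pairs by descending score.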
template <typename dtype>
static bool sort_score_pair_descend(const std::pair<float, dtype>& pair1,
const std::pair<float, dtype>& pair2) {
return pair1.first > pair2.first;
}
template <typename dtype>
void get_max_score_index(const dtype* scores,
int num,
float threshold,
int top_k,
std::vector<std::pair<dtype, int>>* score_index_vec) {
//! Generate index score pairs.
for (int i = 0; i < num; ++i) {
if (scores[i] > threshold) {
score_index_vec->push_back(std::make_pair(scores[i], i));
}
}
//! Sort the score pair according to the scores in descending order
std::stable_sort(score_index_vec->begin(),
score_index_vec->end(),
sort_score_pair_descend<int>);
//! Keep top_k scores if needed.
if (top_k > -1 && top_k < score_index_vec->size()) {
score_index_vec->resize(top_k);
}
}
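// Area of a box given as [xmin, ymin, xmax, ymax]; degenerate boxes yield 0 and
// non-normalized boxes use the +1 pixel convention.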
template <typename dtype>
dtype bbox_size(const dtype* bbox, bool normalized = true) {
if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) {
// If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
return dtype(0.);
} else {
const dtype width = bbox[2] - bbox[0];
const dtype height = bbox[3] - bbox[1];
if (normalized) {
return width * height;
} else {
// If bbox is not within range [0, 1].
return (width + 1) * (height + 1);
}
}
}
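// Intersection over union (Jaccard overlap) of two boxes given as [xmin, ymin, xmax, ymax].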
template <typename dtype>
dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2) {
if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] ||
bbox2[3] < bbox1[1]) {
return dtype(0.);
} else {
const dtype inter_xmin = std::max(bbox1[0], bbox2[0]);
const dtype inter_ymin = std::max(bbox1[1], bbox2[1]);
const dtype inter_xmax = std::min(bbox1[2], bbox2[2]);
const dtype inter_ymax = std::min(bbox1[3], bbox2[3]);
const dtype inter_width = inter_xmax - inter_xmin;
const dtype inter_height = inter_ymax - inter_ymin;
const dtype inter_size = inter_width * inter_height;
const dtype bbox1_size = bbox_size(bbox1);
const dtype bbox2_size = bbox_size(bbox2);
return inter_size / (bbox1_size + bbox2_size - inter_size);
}
}
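// Greedy NMS for one class: keep the highest-scoring remaining box and drop boxes whose
// overlap with any kept box exceeds the threshold; eta < 1 makes the threshold adaptive.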
template <typename dtype>
void apply_nms_fast(const dtype* bboxes,
const dtype* scores,
int num,
float score_threshold,
float nms_threshold,
float eta,
int top_k,
std::vector<int>* indices) {
// Get top_k scores (with corresponding indices).
std::vector<std::pair<dtype, int>> score_index_vec;
get_max_score_index(scores, num, score_threshold, top_k, &score_index_vec);
// Do nms.
float adaptive_threshold = nms_threshold;
indices->clear();
while (score_index_vec.size() != 0) {
const int idx = score_index_vec.front().second;
bool keep = true;
for (int k = 0; k < indices->size(); ++k) {
if (keep) {
const int kept_idx = (*indices)[k];
float overlap =
jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
indices->push_back(idx);
}
score_index_vec.erase(score_index_vec.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
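// Host reference implementation of multiclass NMS; fills `result` with one
// [label, score, xmin, ymin, xmax, ymax] row per kept detection, or a single -1 when nothing is kept.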
template <typename dtype>
void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
int class_num,
const std::vector<int>& priors,
bool share_location,
std::vector<float>* result) {
int background_id = param.background_label;
int keep_topk = param.keep_top_k;
int nms_topk = param.nms_top_k;
float conf_thresh = param.score_threshold;
float nms_thresh = param.nms_threshold;
float nms_eta = param.nms_eta;
const dtype* bbox_data = param.bboxes->data<const dtype>();
const dtype* conf_data = param.scores->data<const dtype>();
dtype* out = param.out->mutable_data<dtype>();
(*result).clear();
int num_kept = 0;
std::vector<std::map<int, std::vector<int>>> all_indices;
int64_t conf_offset = 0;
int64_t bbox_offset = 0;
for (int i = 0; i < priors.size(); ++i) {
std::map<int, std::vector<int>> indices;
int num_det = 0;
int num_priors = priors[i];
int conf_idx = class_num * conf_offset;
int bbox_idx =
share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
for (int c = 0; c < class_num; ++c) {
if (c == background_id) {
// Ignore background class
continue;
}
const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors;
const dtype* cur_bbox_data = bbox_data + bbox_idx;
if (!share_location) {
cur_bbox_data += c * num_priors * 4;
}
apply_nms_fast(cur_bbox_data,
cur_conf_data,
num_priors,
conf_thresh,
nms_thresh,
nms_eta,
nms_topk,
&(indices[c]));
num_det += indices[c].size();
}
if (keep_topk > -1 && num_det > keep_topk) {
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (auto it = indices.begin(); it != indices.end(); ++it) {
int label = it->first;
const std::vector<int>& label_indices = it->second;
for (int j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
float score = conf_data[conf_idx + label * num_priors + idx];
score_index_pairs.push_back(
std::make_pair(score, std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(),
score_index_pairs.end(),
sort_score_pair_descend<std::pair<int, int>>);
score_index_pairs.resize(keep_topk);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (int j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
all_indices.push_back(new_indices);
num_kept += keep_topk;
} else {
all_indices.push_back(indices);
num_kept += num_det;
}
conf_offset += num_priors;
bbox_offset += num_priors;
}
if (num_kept == 0) {
(*result).clear();
(*result).resize(1);
(*result)[0] = -1;
return;
} else {
(*result).resize(num_kept * 6);
}
int count = 0;
conf_offset = 0;
bbox_offset = 0;
for (int i = 0; i < priors.size(); ++i) {
int num_priors = priors[i];
int conf_idx = class_num * conf_offset;
int bbox_idx =
share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) {
int label = it->first;
std::vector<int>& indices = it->second;
const dtype* cur_conf_data = conf_data + conf_idx + label * num_priors;
const dtype* cur_bbox_data = bbox_data + bbox_idx;
if (!share_location) {
cur_bbox_data += label * num_priors * 4;
}
for (int j = 0; j < indices.size(); ++j) {
int idx = indices[j];
(*result)[count * 6] = label;
(*result)[count * 6 + 1] = cur_conf_data[idx];
for (int k = 0; k < 4; ++k) {
(*result)[count * 6 + 2 + k] = cur_bbox_data[idx * 4 + k];
}
++count;
}
}
conf_offset += num_priors;
bbox_offset += num_priors;
}
}
TEST(multiclass_nms_host, init) {
MulticlassNmsCompute multiclass_nms;
ASSERT_EQ(multiclass_nms.precision(), PRECISION(kFloat));
ASSERT_EQ(multiclass_nms.target(), TARGET(kHost));
}
TEST(multiclass_nms_host, retrive_op) {
auto multiclass_nms =
KernelRegistry::Global().Create<TARGET(kHost), PRECISION(kFloat)>(
"multiclass_nms");
ASSERT_FALSE(multiclass_nms.empty());
ASSERT_TRUE(multiclass_nms.front());
}
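// Sweeps prior counts, class numbers and NMS hyper-parameters, runs the host kernel and
// compares every output value against the reference implementation above.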
TEST(multiclass_nms_host, compute) {
MulticlassNmsCompute multiclass_nms;
operators::MulticlassNmsParam param;
lite::Tensor bbox, conf, out;
std::vector<float> out_ref;
for (std::vector<int> priors : {std::vector<int>({2, 2, 2})}) {
int N = priors.size();
for (bool share_location : {true}) {
for (int class_num : {1, 4, 10}) {
DDim* bbox_dim;
DDim* conf_dim;
int M = priors[0];
if (share_location) {
bbox_dim = new DDim({N, M, 4});
} else {
bbox_dim = new DDim({class_num, M, 4});
}
conf_dim = new DDim({N, class_num, M});
bbox.Resize(*bbox_dim);
conf.Resize(*conf_dim);
for (int background_id : {0}) {
for (int keep_topk : {1, 5, 10}) {
for (int nms_topk : {1, 5, 10}) {
for (float nms_eta : {1.0, 0.99, 0.9}) {
for (float nms_thresh : {0.5, 0.7}) {
for (float conf_thresh : {0.5, 0.7}) {
auto* conf_data = conf.mutable_data<float>();
auto* bbox_data = bbox.mutable_data<float>();
for (int i = 0; i < bbox_dim->production(); ++i) {
bbox_data[i] = i * 1. / bbox_dim->production();
}
for (int i = 0; i < conf_dim->production(); ++i) {
conf_data[i] = i * 1. / conf_dim->production();
}
param.bboxes = &bbox;
param.scores = &conf;
param.out = &out;
param.background_label = background_id;
param.keep_top_k = keep_topk;
param.nms_top_k = nms_topk;
param.score_threshold = conf_thresh;
param.nms_threshold = nms_thresh;
param.nms_eta = nms_eta;
multiclass_nms.SetParam(param);
multiclass_nms.Run();
auto* out_data = out.mutable_data<float>();
out_ref.clear();
multiclass_nms_compute_ref<float>(
param, class_num, priors, share_location, &out_ref);
EXPECT_EQ(out.dims().production(), out_ref.size());
if (out.dims().production() == out_ref.size()) {
auto* out_ref_data = out_ref.data();
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
delete bbox_dim;
delete conf_dim;
}
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
...@@ -32,8 +32,10 @@ class ReluCompute ...@@ -32,8 +32,10 @@ class ReluCompute
std::string doc() const override { return "Relu using cl::Buffer, kFloat"; } std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/relu_kernel.cl", build_options_); "buffer/relu_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -46,7 +48,7 @@ class ReluCompute ...@@ -46,7 +48,7 @@ class ReluCompute
auto* x_buf = param.X->data<float, cl::Buffer>(); auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
...@@ -74,6 +76,7 @@ class ReluCompute ...@@ -74,6 +76,7 @@ class ReluCompute
private: private:
std::string kernel_func_name_{"relu"}; std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DRELU"}; std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
...@@ -87,8 +90,10 @@ class SigmoidCompute ...@@ -87,8 +90,10 @@ class SigmoidCompute
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_); "buffer/sigmoid_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -101,7 +106,7 @@ class SigmoidCompute ...@@ -101,7 +106,7 @@ class SigmoidCompute
auto* x_buf = param.X->data<float, cl::Buffer>(); auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target()); VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target()); VLOG(4) << TargetToStr(param.Out->target());
...@@ -129,6 +134,7 @@ class SigmoidCompute ...@@ -129,6 +134,7 @@ class SigmoidCompute
private: private:
std::string kernel_func_name_{"sigmoid"}; std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"}; std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -37,11 +37,12 @@ class ActivationComputeImageDefault ...@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
act_param_ = param_.get_mutable<param_t>(); act_param_ = param_.get_mutable<param_t>();
int act_type = static_cast<int>(act_param_->active_type); int act_type = static_cast<int>(act_param_->active_type);
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "ActivationTypeToStr(act_param_->active_type):" VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
<< ActivationTypeToStr(act_param_->active_type); << ActivationTypeToStr(act_param_->active_type);
#endif
switch (act_type) { switch (act_type) {
case 1: case 1:
kernel_func_name_ = "relu"; kernel_func_name_ = "relu";
...@@ -71,41 +72,70 @@ class ActivationComputeImageDefault ...@@ -71,41 +72,70 @@ class ActivationComputeImageDefault
LOG(FATAL) << "This act type:" << act_type << " doesn't support."; LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return; return;
} }
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel( #endif
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); context.cl_context()->AddKernel(kernel_func_name_,
"image/activation_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
int arg_idx = 0; void ReInitWhenNeeded() override {
cl_int status = kernel.setArg(arg_idx, *x_img); act_param_ = param_.get_mutable<param_t>();
auto x_dims = act_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->X->dims()); // w, h
out_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->Out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
static_cast<cl::size_type>(x_img_shape_[1])};
}
void Run() override {
auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img); status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold_); status = kernel.setArg(2, threshold_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale_); status = kernel.setArg(3, scale_);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target()); const auto& x_dims = act_param_->X->dims();
VLOG(4) << TargetToStr(param.Out->target()); const auto& y_dims = act_param_->Out->dims(); // useless: check dim only
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " " VLOG(4) << TargetToStr(act_param_->X->target());
<< image_shape["height"]; VLOG(4) << TargetToStr(act_param_->Out->target());
VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
<< x_img_shape_[1];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " " VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3]; << x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " " VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
...@@ -115,13 +145,12 @@ class ActivationComputeImageDefault ...@@ -115,13 +145,12 @@ class ActivationComputeImageDefault
VLOG(4) << "kernel func name:" << kernel_func_name_; VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif #endif
auto global_work_size = auto& context = ctx_->As<OpenCLContext>();
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]), CHECK(context.cl_context() != nullptr);
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -131,10 +160,20 @@ class ActivationComputeImageDefault ...@@ -131,10 +160,20 @@ class ActivationComputeImageDefault
private: private:
param_t* act_param_{nullptr}; param_t* act_param_{nullptr};
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim last_x_dims_;
std::string kernel_func_name_{}; std::string kernel_func_name_{};
float threshold_{6.f}; float threshold_{6.f};
float scale_{1.f}; float scale_{1.f};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
} // namespace opencl } // namespace opencl
......
...@@ -43,8 +43,10 @@ class BilinearInterpImageCompute ...@@ -43,8 +43,10 @@ class BilinearInterpImageCompute
bilinear_interp_param_ = param_.get_mutable<param_t>(); bilinear_interp_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/bilinear_interp_kernel.cl", build_options_); "image/bilinear_interp_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -103,7 +105,7 @@ class BilinearInterpImageCompute ...@@ -103,7 +105,7 @@ class BilinearInterpImageCompute
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
...@@ -159,6 +161,7 @@ class BilinearInterpImageCompute ...@@ -159,6 +161,7 @@ class BilinearInterpImageCompute
param_t* bilinear_interp_param_{nullptr}; param_t* bilinear_interp_param_{nullptr};
std::string kernel_func_name_{"bilinear_interp"}; std::string kernel_func_name_{"bilinear_interp"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -38,8 +38,10 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -38,8 +38,10 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
} else { } else {
kernel_func_name_ = "concat_mul"; kernel_func_name_ = "concat_mul";
} }
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/concat_kernel.cl", build_options_); "buffer/concat_kernel.cl",
build_options_,
time_stamp_);
auto axis = concat_param_->axis; auto axis = concat_param_->axis;
auto inputs = concat_param_->x; auto inputs = concat_param_->x;
...@@ -88,7 +90,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -88,7 +90,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto inputs = param.x; auto inputs = param.x;
int arg_idx = 0; int arg_idx = 0;
...@@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL), ...@@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
param_t* concat_param_{nullptr}; param_t* concat_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
kernel_func_name_ = "concat_mul"; kernel_func_name_ = "concat_mul";
} }
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/concat_kernel.cl", build_options_); "image/concat_kernel.cl",
build_options_,
time_stamp_);
auto axis = concat_param_->axis; auto axis = concat_param_->axis;
auto inputs = concat_param_->x; auto inputs = concat_param_->x;
...@@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto inputs = param.x; auto inputs = param.x;
int arg_idx = 0; int arg_idx = 0;
...@@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL), ...@@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
param_t* concat_param_{nullptr}; param_t* concat_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"}; std::string build_options_{" -DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() { ...@@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() {
} }
for (size_t i = 0; i < kernel_func_names_.size(); i++) { for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_names_[i],
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); kernel_func_paths_[i],
build_options_[i],
time_stamp_);
} }
} }
...@@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() { ...@@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key; std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0]; kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
auto img2col_kernel = context.cl_context()->GetKernel(kernel_key.str()); auto img2col_kernel = context.cl_context()->GetKernel(kernel_key.str());
int n_threads = c_in * h_out * w_out; int n_threads = c_in * h_out * w_out;
...@@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() { ...@@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() {
int n = h_out * w_out; int n = h_out * w_out;
VLOG(4) << "m = " << m << " n = " << n << " k = " << k; VLOG(4) << "m = " << m << " n = " << n << " k = " << k;
kernel_key.str(""); kernel_key.str("");
kernel_key << kernel_func_names_[1] << build_options_[1]; kernel_key << kernel_func_names_[1] << build_options_[1] << time_stamp_;
auto gemm_kernel = context.cl_context()->GetKernel(kernel_key.str()); auto gemm_kernel = context.cl_context()->GetKernel(kernel_key.str());
GemmBatched( GemmBatched(
gemm_kernel, col_buf, filter_buf, bias_buf, output_buf, bs, m, n, k); gemm_kernel, col_buf, filter_buf, bias_buf, output_buf, bs, m, n, k);
...@@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() { ...@@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key; std::stringstream kernel_key;
kernel_key << kernel_func_names_.front() << build_options_.front(); kernel_key << kernel_func_names_.front() << build_options_.front()
<< time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k); GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k);
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
namespace paddle { namespace paddle {
...@@ -55,6 +56,7 @@ class ConvCompute ...@@ -55,6 +56,7 @@ class ConvCompute
std::vector<std::string> kernel_func_names_{}; std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{}; std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{}; std::vector<std::string> build_options_{};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() { ...@@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() {
build_options_.push_back(build_options_single); build_options_.push_back(build_options_single);
for (size_t i = 0; i < kernel_func_names_.size(); i++) { for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_names_[i],
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]); kernel_func_paths_[i],
build_options_[i],
time_stamp_);
} }
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}"; << global_work_size_[1] << "," << global_work_size_[2] << "}";
std::stringstream kernel_key; std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0]; kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str()); kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str(); VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str(); VLOG(4) << "kernel ready ... " << kernel_key.str();
...@@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() { ...@@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() {
VLOG(4) << "max_work_group_size: " << max_work_group_size; VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) { if (max_work_group_size > 0 && use_lws_) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1( double min_turn_time = DBL_MAX;
// global_work_size_, max_work_group_size); cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_, global_work_size_, max_work_group_size);
max_work_group_size); cl::NDRange last_local_work_size = cl::NDRange{
static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
if (use_turn_) {
for (size_t i = 1; i < 15; i++) {
if (kernel_h == 1 && kernel_w == 1) {
// TODO: use a different local-work-size strategy for 1x1 kernels
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
} else {
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
}
if (last_local_work_size[0] == local_work_size_[0] &&
last_local_work_size[1] == local_work_size_[1] &&
last_local_work_size[2] == local_work_size_[2]) {
// skip a local work size that was already tried
continue;
}
auto turn_time = this->Turn(5);
if (min_turn_time > turn_time) {
min_turn_time = turn_time;
best_local_work_size = local_work_size_;
}
last_local_work_size = local_work_size_;
}
}
local_work_size_ = best_local_work_size;
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << "," VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}"; << local_work_size_[1] << "," << local_work_size_[2] << "}";
} }
} }
void ConvImageCompute::Conv2d1x1opt() { void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() { ...@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1]; int input_c = input_dims[1];
auto dilations = *param.dilations; auto dilations = *param.dilations;
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image; // VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
...@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() { ...@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
#ifdef PROFILE_CONV_KERNEL
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
event_->wait(); event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
} }
#endif
} }
void ConvImageCompute::Conv2d3x3() { void ConvImageCompute::Conv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() { ...@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d3x3opt() { void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() { ...@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d5x5() { void ConvImageCompute::Conv2d5x5(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() { ...@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d5x5opt() { void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() { ...@@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d7x7() { void ConvImageCompute::Conv2d7x7(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() { ...@@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::Conv2d7x7opt() { void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() { ...@@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d3x3s1() { void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() { ...@@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_); context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d3x3() { void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() { ...@@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_); context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
} }
void ConvImageCompute::DepthwiseConv2d() { void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
...@@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() { ...@@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() {
context.cl_wait_list()->emplace(out_image, event_); context.cl_wait_list()->emplace(out_image, event_);
} }
void ConvImageCompute::Run() { (this->*impl_)(); } void ConvImageCompute::Run() { (this->*impl_)(false); }
#undef PROFILE_CONV_KERNEL
double ConvImageCompute::Turn(int times) {
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start = GetCurrentUS();
for (size_t i = 0; i < times; i++) {
(this->*impl_)(true);
}
auto time_diff = (GetCurrentUS() - start) / times;
return time_diff;
}
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
...@@ -22,40 +22,42 @@ ...@@ -22,40 +22,42 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL), class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16), PRECISION(kFP16),
DATALAYOUT(kImageDefault)> { DATALAYOUT(kImageDefault)> {
public: public:
using param_t = operators::ConvParam; using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)(); using kernel_t = void (ConvImageCompute::*)(bool);
void PrepareForRun() override; void PrepareForRun() override;
void Run() override; void Run() override;
double Turn(int times = 5);
private: private:
void Conv2d1x1opt(); void Conv2d1x1opt(bool is_turn = false);
void Conv2d3x3(); void Conv2d3x3(bool is_turn = false);
void Conv2d3x3opt(); void Conv2d3x3opt(bool is_turn = false);
void Conv2d5x5(); void Conv2d5x5(bool is_turn = false);
void Conv2d5x5opt(); void Conv2d5x5opt(bool is_turn = false);
void Conv2d7x7(); void Conv2d7x7(bool is_turn = false);
void Conv2d7x7opt(); void Conv2d7x7opt(bool is_turn = false);
void DepthwiseConv2d3x3s1(); void DepthwiseConv2d3x3s1(bool is_turn = false);
void DepthwiseConv2d3x3(); void DepthwiseConv2d3x3(bool is_turn = false);
void DepthwiseConv2d(); void DepthwiseConv2d(bool is_turn = false);
kernel_t impl_; kernel_t impl_;
std::vector<std::string> kernel_func_names_{}; std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{}; std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{}; std::vector<std::string> build_options_{};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_; Tensor filter_gpu_image_;
Tensor bias_gpu_image_; Tensor bias_gpu_image_;
...@@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
cl::Kernel kernel_; cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{ cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)}; static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true}; bool use_lws_{true};
bool use_turn_{false};
}; };
} // namespace opencl } // namespace opencl
......
...@@ -44,8 +44,10 @@ class DepthwiseConv2dCompute ...@@ -44,8 +44,10 @@ class DepthwiseConv2dCompute
build_options_ += " -DRELU6"; build_options_ += " -DRELU6";
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/depthwise_conv2d_kernel.cl", build_options_); "buffer/depthwise_conv2d_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -67,7 +69,7 @@ class DepthwiseConv2dCompute ...@@ -67,7 +69,7 @@ class DepthwiseConv2dCompute
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status; cl_int status;
...@@ -120,6 +122,7 @@ class DepthwiseConv2dCompute ...@@ -120,6 +122,7 @@ class DepthwiseConv2dCompute
private: private:
std::string kernel_func_name_{"depthwise_conv2d"}; std::string kernel_func_name_{"depthwise_conv2d"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/dropout_kernel.cl", build_options_); "image/dropout_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status; cl_int status;
...@@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"dropout"}; std::string kernel_func_name_{"dropout"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -25,8 +25,10 @@ namespace opencl { ...@@ -25,8 +25,10 @@ namespace opencl {
void ElementwiseAddCompute::PrepareForRun() { void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); "buffer/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
UpdateParams(); UpdateParams();
} }
...@@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() { ...@@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() {
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>( auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL)); TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target()); VLOG(4) << TargetToStr(ele_param_->X->target());
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -46,6 +47,7 @@ class ElementwiseAddCompute ...@@ -46,6 +47,7 @@ class ElementwiseAddCompute
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"}; std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -23,10 +23,20 @@ namespace lite { ...@@ -23,10 +23,20 @@ namespace lite {
namespace kernels { namespace kernels {
namespace opencl { namespace opencl {
void ElementwiseAddImageCompute::PrepareForRun() { void ElementwiseAddImageCompute::PrepareForRun() {}
void ElementwiseAddImageCompute::ReInitWhenNeeded() {
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
auto x_dims = ele_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// choose kernel
auto* x = ele_param_->X; auto* x = ele_param_->X;
auto* y = ele_param_->Y; auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis; auto axis = ele_param_->axis;
if (y->dims().size() == 4) { if (y->dims().size() == 4) {
...@@ -49,18 +59,48 @@ void ElementwiseAddImageCompute::PrepareForRun() { ...@@ -49,18 +59,48 @@ void ElementwiseAddImageCompute::PrepareForRun() {
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); "image/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
out_img_shape_ =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
} }
void ElementwiseAddImageCompute::Run() { void ElementwiseAddImageCompute::GetGlobalWorkSize() {
auto& context = ctx_->As<OpenCLContext>(); global_work_size_ = cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
CHECK(context.cl_context() != nullptr); static_cast<cl::size_type>(x_img_shape_[1])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
<< x_img_shape_[1];
#endif
}
void ElementwiseAddImageCompute::Run() {
auto* x = ele_param_->X; auto* x = ele_param_->X;
auto* y = ele_param_->Y; auto* y = ele_param_->Y;
auto* out = ele_param_->Out; auto* out = ele_param_->Out;
auto axis = ele_param_->axis; auto axis = ele_param_->axis;
auto x_dims = x->dims();
auto y_dims = y->dims();
auto* x_img = x->data<half_t, cl::Image2D>();
auto* y_img = y->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
...@@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() { ...@@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims(); VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims(); VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis; VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<half_t, cl::Image2D>(); VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
auto* y_img = y->data<half_t, cl::Image2D>(); VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0], VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
out_img_shape[1]); << out_img_shape_[1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
#endif #endif
STL::stringstream kernel_key; cl_int status;
kernel_key << kernel_func_name_ << build_options_; auto kernel = kernel_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
if (y_dims.size() == 4) { if (y_dims.size() == 4) {
cl_int status = kernel.setArg(arg_idx, *x_img); status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img); status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img); status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1) { } else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) { if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1]; const int tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w; VLOG(4) << "tensor_w:" << tensor_w;
#endif #endif
cl_int status = kernel.setArg(arg_idx, *x_img); status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img); status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img); status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w)); status = kernel.setArg(3, tensor_w);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
} else { } else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size() << ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y->dims().size(); << ", y->dims.size():" << y_dims.size();
} }
} else { } else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size() << ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y->dims().size(); << ", y->dims.size():" << y_dims.size();
} }
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width), auto& context = ctx_->As<OpenCLContext>();
static_cast<cl::size_type>(x_img_height)}; CHECK(context.cl_context() != nullptr);
#ifndef LITE_SHUTDOWN_LOG status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
......
...@@ -15,8 +15,10 @@ ...@@ -15,8 +15,10 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -34,6 +36,10 @@ class ElementwiseAddImageCompute ...@@ -34,6 +36,10 @@ class ElementwiseAddImageCompute
void PrepareForRun() override; void PrepareForRun() override;
void ReInitWhenNeeded() override;
void GetGlobalWorkSize();
void Run() override; void Run() override;
std::string doc() const override { std::string doc() const override {
...@@ -42,8 +48,21 @@ class ElementwiseAddImageCompute ...@@ -42,8 +48,21 @@ class ElementwiseAddImageCompute
protected: protected:
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
DDim last_x_dims_;
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim y_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
std::string kernel_func_name_{"elementwise_add"}; std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
cl::Kernel kernel_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
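The members added above (last_x_dims_, first_epoch_for_reinit_, the cached kernel_ and global_work_size_) drive the new ReInitWhenNeeded() path: image shapes, the kernel handle and the work size are recomputed only when the input dims actually change. A stripped-down sketch of that guard, with std::vector<int64_t> standing in for DDim and the surrounding class invented for illustration:

#include <cstdint>
#include <iostream>
#include <vector>

class ReinitGuardDemo {
 public:
  // Called once per inference step with the current input shape.
  void ReInitWhenNeeded(const std::vector<int64_t>& x_dims) {
    if (first_epoch_for_reinit_ || x_dims != last_x_dims_) {
      last_x_dims_ = x_dims;
      first_epoch_for_reinit_ = false;
      ExpensiveSetup(x_dims);  // recompute image shape / work size
    }
  }

 private:
  void ExpensiveSetup(const std::vector<int64_t>& dims) {
    std::cout << "re-initialising for a " << dims.size() << "-D input\n";
  }

  bool first_epoch_for_reinit_{true};
  std::vector<int64_t> last_x_dims_;
};

int main() {
  ReinitGuardDemo demo;
  demo.ReInitWhenNeeded({1, 3, 224, 224});  // first epoch: setup runs
  demo.ReInitWhenNeeded({1, 3, 224, 224});  // same dims: setup skipped
  demo.ReInitWhenNeeded({1, 3, 112, 112});  // new dims: setup runs again
  return 0;
}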
...@@ -71,8 +71,10 @@ class ElementwiseMulImageCompute ...@@ -71,8 +71,10 @@ class ElementwiseMulImageCompute
VLOG(4) << "bias_dims.size():" << bias_dims.size(); VLOG(4) << "bias_dims.size():" << bias_dims.size();
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_); "image/elementwise_mul_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -114,7 +116,7 @@ class ElementwiseMulImageCompute ...@@ -114,7 +116,7 @@ class ElementwiseMulImageCompute
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto bias_dims = y->dims(); auto bias_dims = y->dims();
...@@ -201,6 +203,7 @@ class ElementwiseMulImageCompute ...@@ -201,6 +203,7 @@ class ElementwiseMulImageCompute
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"}; std::string kernel_func_name_{"elementwise_mul"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() { ...@@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() {
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/elementwise_sub_kernel.cl", build_options_); "image/elementwise_sub_kernel.cl",
build_options_,
time_stamp_);
} }
void ElementwiseSubImageCompute::Run() { void ElementwiseSubImageCompute::Run() {
...@@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() { ...@@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() {
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <string> #include <string>
#include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
...@@ -44,6 +45,7 @@ class ElementwiseSubImageCompute ...@@ -44,6 +45,7 @@ class ElementwiseSubImageCompute
param_t* ele_param_{nullptr}; param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_sub"}; std::string kernel_func_name_{"elementwise_sub"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
...@@ -30,74 +31,98 @@ class FcCompute ...@@ -30,74 +31,98 @@ class FcCompute
public: public:
using param_t = operators::FcParam; using param_t = operators::FcParam;
void PrepareForRun() override { void PrepareForRun() override {}
const auto& param = *param_.get_mutable<param_t>();
const auto x_dims = param.input->dims();
const auto w_dims = param.w->dims();
void ReInitWhenNeeded() override {
fc_param_ = param_.get_mutable<param_t>();
const auto x_dims = fc_param_->input->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute m,n,k
const auto w_dims = fc_param_->w->dims();
CHECK_GE(x_dims.size(), 2UL); CHECK_GE(x_dims.size(), 2UL);
CHECK_GE(w_dims.size(), 2UL); CHECK_GE(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL); CHECK_EQ(fc_param_->output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production(); m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1]; n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0])); CHECK_EQ(k_, static_cast<int>(w_dims[0]));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3]; << " " << x_dims[3];
VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2] VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
<< " " << w_dims[3]; << " " << w_dims[3];
VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_; VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
#endif
// choose kernel
if (m_ == 1) { // gemv if (m_ == 1) { // gemv
kernel_func_name_ = "fc_gemv_1x4"; kernel_func_name_ = "fc_gemv_1x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm } else { // gemm
kernel_func_name_ = "fc_gemm_4x4"; kernel_func_name_ = "fc_gemm_4x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
} }
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
#endif
if (param.activation_type == "relu") { if (fc_param_->activation_type == "relu") {
build_options_ += "-DRELU"; build_options_ += "-DRELU";
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/fc_kernel.cl", build_options_); "buffer/fc_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
if (m_ == 1) { // gemv
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
}
} }
void Run() override { void Run() override {
const auto& param = *param_.get_mutable<param_t>(); auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
auto& context = ctx_->As<OpenCLContext>(); auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
CHECK(context.cl_context() != nullptr); auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
auto* x_buf = param.input->data<float, cl::Buffer>();
auto* w_buf = param.w->data<float, cl::Buffer>();
auto* bias_buf = param.bias->data<float, cl::Buffer>();
auto* out_buf = auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto kernel = kernel_;
cl_int status; cl_int status;
int arg_idx = 0; status = kernel.setArg(0, *x_buf);
status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *w_buf); status = kernel.setArg(1, *w_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *bias_buf); status = kernel.setArg(2, *bias_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf); status = kernel.setArg(3, *out_buf);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(m_)); status = kernel.setArg(4, static_cast<const int>(m_));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(n_)); status = kernel.setArg(5, static_cast<const int>(n_));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(k_)); status = kernel.setArg(6, static_cast<const int>(k_));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
...@@ -111,9 +136,14 @@ class FcCompute ...@@ -111,9 +136,14 @@ class FcCompute
private: private:
int m_, n_, k_; int m_, n_, k_;
param_t* fc_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float "}; std::string build_options_{"-DCL_DTYPE_float "};
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
cl::NDRange global_work_size_; cl::NDRange global_work_size_;
cl::Kernel kernel_;
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
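GetGlobalWorkSize() above rounds the output dimensions up to 4-wide tiles to match the fc_gemv_1x4 / fc_gemm_4x4 kernels. A quick check of that arithmetic (the tile width of 4 is read off the (x + 3) / 4 expressions; the sample shapes are made up):

#include <cstddef>
#include <iostream>

// Ceiling division by the 4-wide tile used by fc_gemv_1x4 / fc_gemm_4x4.
size_t RoundUpDiv4(size_t x) { return (x + 3) / 4; }

int main() {
  // m == 1 selects the gemv kernel: a 1-D range over the output width n.
  size_t n = 1000;
  std::cout << "gemv global size: " << RoundUpDiv4(n) << "\n";  // 250

  // m > 1 selects the gemm kernel: a 2-D range over (m, n) tiles.
  size_t m = 6;
  std::cout << "gemm global size: " << RoundUpDiv4(m) << " x "
            << RoundUpDiv4(n) << "\n";  // 2 x 250
  return 0;
}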
...@@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute { ...@@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
void PrepareForRun() override { void PrepareForRun() override {
build_options_ += " -DRELU"; build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_); "buffer/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
UpdateParams(); UpdateParams();
auto act_t = static_cast<param_t*>(ele_param_)->act_type; auto act_t = static_cast<param_t*>(ele_param_)->act_type;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_image_compute.h" #include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute ...@@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute
void PrepareForRun() override { void PrepareForRun() override {
build_options_ += " -DRELU"; build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_); "image/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>(); ele_param_ = param_.get_mutable<param_t>();
auto act_t = static_cast<param_t*>(ele_param_)->act_type; auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t; VLOG(4) << "act: " << act_t;
......
...@@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
} }
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/grid_sampler_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
}
void ReInitWhenNeeded() override {
grid_param_ = param_.get_mutable<param_t>(); grid_param_ = param_.get_mutable<param_t>();
auto x_dims = grid_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
// compute global work size
GetGlobalWorkSize();
}
}
auto& context = ctx_->As<OpenCLContext>(); void GetGlobalWorkSize() {
context.cl_context()->AddKernel( auto default_work_size =
kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_); DefaultWorkSize(grid_param_->out->dims(),
VLOG(4) << "kernel_func_name_:" << kernel_func_name_; DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_img_shape_[0]),
static_cast<int64_t>(out_img_shape_[1])}));
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
#endif
} }
void Run() override { void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = grid_param_->x; auto* x = grid_param_->x;
auto* out = grid_param_->out;
auto* grid = grid_param_->grid; auto* grid = grid_param_->grid;
auto* out = grid_param_->out;
auto out_dims = out->dims(); auto out_dims = out->dims();
auto in_dims = x->dims(); int out_height = out_dims[2];
int out_width = out_dims[3];
auto* x_img = x->data<half_t, cl::Image2D>();
auto* grid_img = grid->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
auto in_dims = x->dims();
VLOG(4) << "x->target():" << TargetToStr(x->target()); VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target()); VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims; VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims; VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img; // VLOG(4) << "x_image: " << x_img;
auto* grid_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "grid_img: " << grid_img; // VLOG(4) << "grid_img: " << grid_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img; // VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " " VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
<< out_image_shape["height"]; << out_img_shape_[1];
#endif #endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; cl_int status;
int out_height = out_dims[2]; auto kernel = kernel_;
int out_width = out_dims[3]; status = kernel.setArg(0, *x_img);
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img); status = kernel.setArg(1, *grid_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img); status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_height); status = kernel.setArg(3, out_height);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_width); status = kernel.setArg(4, out_width);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
auto global_work_size = auto& context = ctx_->As<OpenCLContext>();
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]), CHECK(context.cl_context() != nullptr);
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_); context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
} }
protected: protected:
param_t* grid_param_{nullptr}; param_t* grid_param_{nullptr};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
std::string kernel_func_name_{"grid_sampler"}; std::string kernel_func_name_{"grid_sampler"};
cl::Kernel kernel_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -74,6 +74,12 @@ static std::vector<size_t> DefaultWorkSize(const DDim& image_dim, ...@@ -74,6 +74,12 @@ static std::vector<size_t> DefaultWorkSize(const DDim& image_dim,
LOG(FATAL) << " not support this dim, need imp "; LOG(FATAL) << " not support this dim, need imp ";
} }
static const std::string GetTimeStamp() {
struct timeval time;
gettimeofday(&time, NULL);
return std::to_string(time.tv_usec);
}
} // namespace opencl } // namespace opencl
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
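GetTimeStamp() above returns only the microsecond field (tv_usec, 0-999999) of the current time as a string. A self-contained version for reference; the <sys/time.h> include is assumed to be provided by the real header's existing includes:

#include <sys/time.h>

#include <iostream>
#include <string>

// Same shape as the helper added above: stringify the microsecond field.
static const std::string GetTimeStampDemo() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return std::to_string(time.tv_usec);
}

int main() {
  std::cout << "stamp: " << GetTimeStampDemo() << "\n";
  return 0;
}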
...@@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); "image/instance_norm_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
out_image_shape["width"], out_image_shape["height"]); out_image_shape["width"], out_image_shape["height"]);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status = kernel.setArg(0, out_w); cl_int status = kernel.setArg(0, out_w);
...@@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
bias_image_.mutable_data<half_t, cl::Image2D>( bias_image_.mutable_data<half_t, cl::Image2D>(
scale_img_size[0], scale_img_size[1], bias_img.data()); scale_img_size[0], scale_img_size[1], bias_img.data());
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/instance_norm_kernel.cl", build_options_); "image/instance_norm_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto* scale_img = scale_image_.data<half_t, cl::Image2D>(); auto* scale_img = scale_image_.data<half_t, cl::Image2D>();
auto* bias_img = bias_image_.data<half_t, cl::Image2D>(); auto* bias_img = bias_image_.data<half_t, cl::Image2D>();
...@@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
param_t* instance_norm_param_{nullptr}; param_t* instance_norm_param_{nullptr};
std::string kernel_func_name_{"instance_norm_onnx"}; std::string kernel_func_name_{"instance_norm_onnx"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor scale_image_; Tensor scale_image_;
Tensor bias_image_; Tensor bias_image_;
......
...@@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
beta_ = lrn_param_->beta; beta_ = lrn_param_->beta;
norm_region_ = lrn_param_->norm_region; norm_region_ = lrn_param_->norm_region;
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/lrn_kernel.cl", build_options_); kernel_func_name_, "image/lrn_kernel.cl", build_options_, time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
...@@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL), ...@@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
std::string norm_region_{"AcrossChannels"}; std::string norm_region_{"AcrossChannels"};
std::string kernel_func_name_{"lrn"}; std::string kernel_func_name_{"lrn"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h" #include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h" #include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h" #include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
...@@ -32,8 +33,10 @@ class MulCompute ...@@ -32,8 +33,10 @@ class MulCompute
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/mat_mul_kernel.cl", build_options_); "buffer/mat_mul_kernel.cl",
build_options_,
time_stamp_);
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
const auto* x_data = param.x->data<float>(); const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>(); const auto* y_data = param.y->data<float>();
...@@ -68,7 +71,7 @@ class MulCompute ...@@ -68,7 +71,7 @@ class MulCompute
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status; cl_int status;
...@@ -103,6 +106,7 @@ class MulCompute ...@@ -103,6 +106,7 @@ class MulCompute
int m_, n_, k_; int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"}; std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault ...@@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_); "image/nearest_interp_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault ...@@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
...@@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault ...@@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault
private: private:
std::string kernel_func_name_{"nearest_interp"}; std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{" -DCL_DTYPE_half"}; std::string build_options_{" -DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL), ...@@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
} }
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/pad2d_kernel.cl", build_options_); "image/pad2d_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
} }
...@@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL), ...@@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
#endif #endif
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0; int arg_idx = 0;
...@@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL), ...@@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
param_t* pad2d_param_{nullptr}; param_t* pad2d_param_{nullptr};
std::string kernel_func_name_{}; std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -37,8 +37,10 @@ class PoolCompute ...@@ -37,8 +37,10 @@ class PoolCompute
const auto& param = *param_.get_mutable<param_t>(); const auto& param = *param_.get_mutable<param_t>();
kernel_func_name_ += param.pooling_type; kernel_func_name_ += param.pooling_type;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "buffer/pool_kernel.cl", build_options_); "buffer/pool_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -69,7 +71,7 @@ class PoolCompute ...@@ -69,7 +71,7 @@ class PoolCompute
auto* output_buf = auto* output_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL)); param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status; cl_int status;
auto numel = out_dims.production(); auto numel = out_dims.production();
...@@ -117,6 +119,7 @@ class PoolCompute ...@@ -117,6 +119,7 @@ class PoolCompute
private: private:
std::string kernel_func_name_{"pool_"}; std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"}; std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel( context.cl_context()->AddKernel(
kernel_func_name_, "image/pool_kernel.cl", build_options_); kernel_func_name_, "image/pool_kernel.cl", build_options_, time_stamp_);
} }
void Run() override { void Run() override {
...@@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
// VLOG(4) << "out_image" << out_img; // VLOG(4) << "out_image" << out_img;
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (out_dims[1] + 3) / 4; int c_block = (out_dims[1] + 3) / 4;
...@@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"pool_"}; std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/reshape_kernel.cl", build_options_); "image/reshape_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG #ifndef LITE_SHUTDOWN_LOG
...@@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL), ...@@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"reshape"}; std::string kernel_func_name_{"reshape"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/scale_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_); STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
scale_param_ = param_.get_mutable<param_t>();
auto x_dims = scale_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(out_img_shape_[0]),
static_cast<cl::size_type>(out_img_shape_[1])};
} }
void Run() override { void Run() override {
const auto& param = *param_.get_mutable<param_t>(); auto* x_img = scale_param_->x->data<half_t, cl::Image2D>();
const auto& in_dims = param.x->dims(); auto* out_img = scale_param_->output->mutable_data<half_t, cl::Image2D>(
auto* x_img = param.x->data<half_t, cl::Image2D>(); out_img_shape_[0], out_img_shape_[1]);
const float scale = param.scale; const float scale = scale_param_->scale;
const float bias = param.bias; const float bias = scale_param_->bias;
// LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// LOG(INFO) << "out_image" << out_img;
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
auto kernel = kernel_;
cl_int status; cl_int status;
int arg_idx = 0; status = kernel.setArg(0, *x_img);
status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img); status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale); status = kernel.setArg(2, scale);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias); status = kernel.setArg(3, bias);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel, kernel,
cl::NullRange, cl::NullRange,
global_work_size, global_work_size_,
cl::NullRange, cl::NullRange,
nullptr, nullptr,
event_.get()); event_.get());
...@@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"scale"}; std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
param_t* scale_param_{nullptr};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
}; };
} // namespace opencl } // namespace opencl
......
...@@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override { void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_; VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel( context.cl_context()->AddKernel(kernel_func_name_,
kernel_func_name_, "image/slice_kernel.cl", build_options_); "image/slice_kernel.cl",
build_options_,
time_stamp_);
} }
void Run() override { void Run() override {
...@@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>(); auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr); CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key; STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_; kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str()); auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status; cl_int status;
...@@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL), ...@@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
private: private:
std::string kernel_func_name_{"slice"}; std::string kernel_func_name_{"slice"};
std::string build_options_{"-DCL_DTYPE_half"}; std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event}; std::shared_ptr<cl::Event> event_{new cl::Event};
}; };
......
...@@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x ...@@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x
lite_cc_library(subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_yolo_box_op_xpu SRCS yolo_box_op.cc DEPS ${xpu_subgraph_bridge_deps})
set(xpu_subgraph_bridges set(xpu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
...@@ -48,6 +49,7 @@ set(xpu_subgraph_bridges ...@@ -48,6 +49,7 @@ set(xpu_subgraph_bridges
subgraph_bridge_dropout_op_xpu subgraph_bridge_dropout_op_xpu
subgraph_bridge_matmul_op_xpu subgraph_bridge_matmul_op_xpu
subgraph_bridge_cast_op_xpu subgraph_bridge_cast_op_xpu
subgraph_bridge_yolo_box_op_xpu
CACHE INTERNAL "xpu_subgraph_bridges") CACHE INTERNAL "xpu_subgraph_bridges")
message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}") message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
...@@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU); ...@@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU);
USE_SUBGRAPH_BRIDGE(dropout, kXPU); USE_SUBGRAPH_BRIDGE(dropout, kXPU);
USE_SUBGRAPH_BRIDGE(matmul, kXPU); USE_SUBGRAPH_BRIDGE(matmul, kXPU);
USE_SUBGRAPH_BRIDGE(cast, kXPU); USE_SUBGRAPH_BRIDGE(cast, kXPU);
USE_SUBGRAPH_BRIDGE(yolo_box, kXPU);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int YoloBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto img_size_name = op_info->Input("ImgSize").front();
auto img_size = scope->FindTensor(img_size_name);
auto boxes_name = op_info->Output("Boxes").front();
auto scores_name = op_info->Output("Scores").front();
auto anchors = op_info->GetAttr<std::vector<int>>("anchors");
auto class_num = op_info->GetAttr<int>("class_num");
auto conf_thresh = op_info->GetAttr<float>("conf_thresh");
auto downsample_ratio = op_info->GetAttr<int>("downsample_ratio");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// ImgSize node
std::shared_ptr<Node> img_size_node = nullptr;
if (graph->Has(img_size_name)) {
img_size_node = graph->Get(img_size_name);
} else {
img_size_node = graph->Add(img_size_name, *img_size);
}
  // YoloBox node
auto yolo_box_data =
graph->builder_.CreateYoloBox(*x_node->data(),
*img_size_node->data(),
CvtShape<xtcl::Integer>(anchors),
class_num,
conf_thresh,
downsample_ratio);
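  // yolo_box produces two outputs: field 0 holds the decoded boxes,
  // field 1 holds the per-class confidence scores.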
graph->Add(boxes_name, graph->builder_.GetField(yolo_box_data, 0));
graph->Add(scores_name, graph->builder_.GetField(yolo_box_data, 1));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(yolo_box,
kXPU,
paddle::lite::subgraph::xpu::YoloBoxConverter);
...@@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() {
subgraph::xpu::Graph graph; subgraph::xpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance(); const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) { for (auto& inst : origin_program_) {
auto op = inst.op(); auto op = const_cast<OpLite*>(inst.op());
CHECK(op); CHECK(op);
op->CheckShape(); op->CheckShape();
op->InferShape(); op->InferShape();
...@@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() {
return subgraph::FAILED; return subgraph::FAILED;
} }
auto kernel = inst.kernel(); auto kernel = inst.kernel();
status |= status |= bridges.Select(op_type, TARGET(kXPU))(
bridges.Select(op_type, TARGET(kXPU))(reinterpret_cast<void*>(&graph), reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) { if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED; return subgraph::FAILED;
} }
......
...@@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { ...@@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
pb_dims->Resize(static_cast<int>(dims.size()), 0); pb_dims->Resize(static_cast<int>(dims.size()), 0);
auto dims_vec = dims.Vectorize(); auto dims_vec = dims.Vectorize();
std::copy(dims_vec.begin(), dims_vec.end(), pb_dims->begin()); std::copy(dims_vec.begin(), dims_vec.end(), pb_dims->begin());
int32_t size = desc.ByteSize(); int32_t size = desc.ByteSizeLong();
os.write(reinterpret_cast<const char *>(&size), sizeof(size)); os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString(); auto out = desc.SerializeAsString();
os.write(out.data(), size); os.write(out.data(), size);
......
...@@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) ...@@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS})
# 4. training op # 4. training op
add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS}) add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS})
if (LITE_WITH_TRAIN)
add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS}) add_operator(mean_grad_op train SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS}) add_operator(activation_grad_ops train SRCS activation_grad_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS}) add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS}) add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
endif()
if (NOT LITE_WITH_X86) if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc lite_cc_test(test_fc_op SRCS fc_op_test.cc
......
...@@ -71,6 +71,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { ...@@ -71,6 +71,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} else if (opdesc.Type() == "exp") { } else if (opdesc.Type() == "exp") {
// exp // exp
param_.active_type = lite_api::ActivationType::kExp; param_.active_type = lite_api::ActivationType::kExp;
} else if (opdesc.Type() == "abs") {
// abs
param_.active_type = lite_api::ActivationType::kAbs;
} }
VLOG(4) << "opdesc.Type():" << opdesc.Type(); VLOG(4) << "opdesc.Type():" << opdesc.Type();
...@@ -92,6 +95,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp); ...@@ -92,6 +95,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
......