Unverified commit b7771b58, authored by myq406450149, committed by GitHub

Merge pull request #6 from PaddlePaddle/develop

sync
......@@ -67,7 +67,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......
......@@ -152,9 +152,10 @@ endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
if (LITE_WITH_PRECISION_PROFILE)
endif()
if (LITE_WITH_PRECISION_PROFILE)
add_definitions("-DLITE_WITH_PRECISION_PROFILE")
endif()
endif()
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
......@@ -307,6 +307,9 @@ function(add_kernel TARGET device level)
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
return()
endif()
if ("${device}" STREQUAL "Host")
......@@ -322,16 +325,11 @@ function(add_kernel TARGET device level)
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
......@@ -434,11 +432,13 @@ function(add_operator TARGET level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
return()
endif()
foreach(src ${args_SRCS})
if(LITE_BUILD_TAILOR)
......
......@@ -108,8 +108,8 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif()
if (LITE_WITH_X86)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
if (LITE_WITH_CUDA OR LITE_WITH_X86)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
......@@ -117,50 +117,44 @@ if (LITE_WITH_X86)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
if (LITE_WITH_X86)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
if(LITE_WITH_CUDA)
add_custom_target(publish_inference_cuda_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_demos)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
......@@ -192,7 +186,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference publish_inference_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so)
endif()
endif()
else()
......
......@@ -8,11 +8,12 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
if(LITE_WITH_X86)
......@@ -27,13 +28,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
DEPS ${light_lib_DEPS}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels})
NPU_DEPS ${npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
......@@ -262,7 +263,10 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -311,7 +315,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils)
DEPS gflags kernel op optimizer mir_passes utils ${host_kernels})
add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
......
......@@ -44,7 +44,10 @@ DEFINE_string(input_shape,
"set input shapes according to the model, "
"separated by colon and comma, "
"such as 1,3,244,244");
DEFINE_string(input_img_path, "", "the path of input image");
DEFINE_string(input_img_path,
"",
"the path of input image, if not set "
"input_img_path, the input of model will be 1.0.");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
......@@ -57,16 +60,11 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save benchmark "
"result to the file");
"save the inference time to the file.");
DEFINE_bool(run_model_optimize,
false,
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
DEFINE_bool(is_quantized_model,
false,
"if set true, "
"test the performance of the quantized model. ");
namespace paddle {
namespace lite_api {
......@@ -87,10 +85,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
};
if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
config.set_valid_places(vaild_places);
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -181,8 +175,8 @@ void Run(const std::vector<int64_t>& input_shape,
int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
if (FLAGS_model_dir == "") {
LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage.";
exit(0);
}
......
......@@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
......@@ -295,6 +296,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
// Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places
const std::vector<std::string> quant_dequant_op = {
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
......@@ -317,7 +320,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
}
}
if (is_quantized_model) {
inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
......
......@@ -31,10 +31,17 @@ namespace lite {
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config;
auto places = config.valid_places();
#ifdef LITE_WITH_CUDA
// if kCUDA is included in valid places, it should be initialized first,
// otherwise skip this step.
for (auto &p : places) {
if (p.target == TARGET(kCUDA)) {
Env<TARGET(kCUDA)>::Init();
break;
}
}
#endif
auto places = config.valid_places();
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
......
......@@ -13,13 +13,9 @@
// limitations under the License.
#include "lite/api/light_api.h"
#include <algorithm>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#include <algorithm>
namespace paddle {
namespace lite {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/api/paddle_api.h"
namespace paddle {
namespace lite_api {
void RunModel() {
// 1. Set MobileConfig
MobileConfig mobile_config;
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> mobile_predictor =
CreatePaddlePredictor<MobileConfig>(mobile_config);
}
} // namespace lite_api
} // namespace paddle
......@@ -23,6 +23,7 @@
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/op_registry.h"
......
......@@ -99,7 +99,8 @@ enum class ActivationType : int {
kTanh = 6,
kSwish = 7,
kExp = 8,
NUM = 9,
kAbs = 9,
NUM = 10,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
......@@ -36,7 +36,8 @@ void TestModel(const std::vector<Place>& valid_places) {
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
......@@ -67,15 +68,13 @@ void TestModel(const std::vector<Place>& valid_places) {
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
auto* out_data = out->data<float>();
auto out = predictor.GetOutputs();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out->numel(); i++) {
fprintf(fp, "%f\n", out_data[i]);
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
}
fclose(fp);
}
......
......@@ -13,7 +13,9 @@
// limitations under the License.
#include <gflags/gflags.h>
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#endif
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
......
......@@ -29,6 +29,7 @@ enum class BinaryOperation {
kADD = 0,
kMUL = 1,
kDIV = 2,
kSUB = 3,
};
template <typename T>
......@@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x,
if (type == BinaryOperation::kADD) return x + y;
if (type == BinaryOperation::kMUL) return x * y;
if (type == BinaryOperation::kDIV) return x / y;
if (type == BinaryOperation::kSUB) return x - y;
}
template <typename T>
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/backends/opencl/cl_context.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
......@@ -35,8 +36,10 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
STL::stringstream program_key_ss;
program_key_ss << file_name << options;
std::string program_key = program_key_ss.str();
auto it = programs_.find(program_key);
if (it != programs_.end()) {
auto &programs = CLRuntime::Global()->programs();
auto it = programs.find(program_key);
if (it != programs.end()) {
VLOG(3) << " --- program -> " << program_key << " has been built --- ";
return *(it->second);
}
......@@ -47,14 +50,15 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
CLRuntime::Global()->BuildProgram(program.get(), options);
VLOG(3) << " --- end build program -> " << program_key << " --- ";
programs_[program_key] = std::move(program);
programs[program_key] = std::move(program);
return *(programs_[program_key]);
return *(programs[program_key]);
}
void CLContext::AddKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options) {
const std::string &options,
const std::string &time_stamp) {
cl_int status{CL_SUCCESS};
VLOG(3) << " --- to get program " << file_name << " --- ";
auto program = GetProgram(file_name, options);
......@@ -64,24 +68,29 @@ void CLContext::AddKernel(const std::string &kernel_name,
new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_FATAL(status);
VLOG(3) << " --- end create kernel --- ";
kernels_.emplace_back(std::move(kernel));
auto &kernels = CLRuntime::Global()->kernels();
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
kernels.emplace_back(std::move(kernel));
STL::stringstream kernel_key;
kernel_key << kernel_name << options;
kernel_offset_[kernel_key.str()] = kernels_.size() - 1;
kernel_key << kernel_name << options << time_stamp;
kernel_offset_map[kernel_key.str()] = kernels.size() - 1;
}
cl::Kernel &CLContext::GetKernel(const int index) {
VLOG(3) << " --- kernel count: " << kernels_.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels_.size())
auto &kernels = CLRuntime::Global()->kernels();
VLOG(3) << " --- kernel count: " << kernels.size() << " --- ";
CHECK(static_cast<size_t>(index) < kernels.size())
<< "The index must be less than the size of kernels.";
CHECK(kernels_[index] != nullptr)
CHECK(kernels[index] != nullptr)
<< "The target kernel pointer cannot be null.";
return *(kernels_[index]);
return *(kernels[index]);
}
cl::Kernel &CLContext::GetKernel(const std::string &name) {
auto it = kernel_offset_.find(name);
CHECK(it != kernel_offset_.end()) << "Cannot find the kernel function: "
auto &kernel_offset_map = CLRuntime::Global()->kernel_offset();
auto it = kernel_offset_map.find(name);
CHECK(it != kernel_offset_map.end()) << "Cannot find the kernel function: "
<< name;
return GetKernel(it->second);
}
......@@ -121,14 +130,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
#if 1
auto gws0 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws2 = global_work_size[2];
#else
auto gws2 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws0 = global_work_size[2];
#endif
if (divisor > 1) {
max_work_size /= divisor;
}
if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws;
}
while (gws1 > max_work_size && max_work_size > 0) {
gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
}
while (gws2 * gws1 > max_work_size && max_work_size > 0) {
gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
}
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
}
#if 1
return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(gws1),
static_cast<size_t>(gws2)};
#else
return cl::NDRange{static_cast<size_t>(gws2),
static_cast<size_t>(gws1),
static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
size_t max_work_size) {
int preferred_lws = 0;
int divisor = 2;
auto tmp0 = global_work_size[0];
auto tmp1 = global_work_size[1];
auto tmp2 = global_work_size[2];
auto gws0 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws2 = global_work_size[2];
if (divisor > 1) {
max_work_size /= divisor;
......@@ -136,18 +184,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws;
}
while (tmp1 > max_work_size && max_work_size > 0) {
tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
while (gws1 > max_work_size && max_work_size > 0) {
gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
}
while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
while (gws2 * gws1 > max_work_size && max_work_size > 0) {
gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
}
while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
}
return cl::NDRange{static_cast<size_t>(tmp0),
static_cast<size_t>(tmp1),
static_cast<size_t>(tmp2)};
return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(gws1),
static_cast<size_t>(gws2)};
}
} // namespace lite
......
......@@ -36,7 +36,8 @@ class CLContext {
void AddKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options = "");
const std::string &options = "",
const std::string &time_stamp = "");
cl::Kernel &GetKernel(const int index);
......@@ -46,10 +47,11 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_;
std::map<std::string, int> kernel_offset_;
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
size_t max_work_size,
int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size);
};
} // namespace lite
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "lite/backends/opencl/cl_runtime.h"
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "lite/utils/cp_logging.h"
......@@ -29,10 +30,26 @@ CLRuntime* CLRuntime::Global() {
CLRuntime::~CLRuntime() {
if (command_queue_ != nullptr) {
command_queue_->flush();
command_queue_->finish();
}
// For controlling the destruction order:
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get());
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto& p : programs_) {
clReleaseProgram(p.second->get());
}
programs_.clear();
// For controlling the destruction order
command_queue_&& clReleaseCommandQueue(command_queue_->get());
command_queue_.reset();
context_&& clReleaseContext(context_->get());
context_.reset();
device_.reset();
platform_.reset();
......@@ -73,14 +90,14 @@ cl::CommandQueue& CLRuntime::command_queue() {
return *command_queue_;
}
std::unique_ptr<cl::Program> CLRuntime::CreateProgram(
std::shared_ptr<cl::Program> CLRuntime::CreateProgram(
const cl::Context& context, std::string file_name) {
auto cl_file = opencl_kernels_files.find(file_name);
std::string content(cl_file->second.begin(), cl_file->second.end());
cl::Program::Sources sources;
sources.push_back(content);
auto prog =
std::unique_ptr<cl::Program>(new cl::Program(context, sources, &status_));
std::shared_ptr<cl::Program>(new cl::Program(context, sources, &status_));
VLOG(4) << "OpenCL kernel file name: " << file_name;
VLOG(4) << "Program source size: " << content.size();
CL_CHECK_FATAL(status_);
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
......@@ -42,7 +43,7 @@ class CLRuntime {
cl::CommandQueue& command_queue();
std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::shared_ptr<cl::Program> CreateProgram(const cl::Context& context,
std::string file_name);
std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
......@@ -57,6 +58,12 @@ class CLRuntime {
std::map<std::string, size_t>& GetDeviceInfo();
std::unordered_map<std::string, std::shared_ptr<cl::Program>>& programs() {
return programs_;
}
std::vector<std::unique_ptr<cl::Kernel>>& kernels() { return kernels_; }
std::map<std::string, int>& kernel_offset() { return kernel_offset_; }
private:
CLRuntime() = default;
......@@ -98,6 +105,12 @@ class CLRuntime {
std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
std::unordered_map<std::string, std::shared_ptr<cl::Program>> programs_{};
std::vector<std::unique_ptr<cl::Kernel>> kernels_{};
std::map<std::string, int> kernel_offset_{};
cl_int status_{CL_SUCCESS};
bool initialized_{false};
......
......@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#ifndef LITE_SHUTDOWN_LOG
#define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \
......@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#else
#define CL_CHECK_FATAL(err_code__)
#endif
} // namespace lite
} // namespace paddle
......@@ -181,7 +181,11 @@ class Context<TargetType::kCUDA> {
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {
if (devs.size() > 0) {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
} else {
LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
}
}
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
CHECK_GT(devs.size(), 0UL)
......
......@@ -142,7 +142,7 @@ class Env {
// Get device count
count = API::num_devices();
if (count == 0) {
CHECK(false) << "No device found!";
LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!";
} else {
LOG(INFO) << "Found " << count << " device(s)";
}
......
......@@ -26,7 +26,8 @@ namespace mir {
void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
std::vector<std::string> conv_type_cases{
"conv2d", "depthwise_conv2d", "conv2d_transpose"};
// start fuse using params
for (auto conv_has_bias : conv_has_bias_cases) {
for (auto conv_type : conv_type_cases) {
......
......@@ -103,10 +103,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
if (conv_type_ == "conv2d_transpose") {
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[1]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
} else {
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
}
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization =
......@@ -153,6 +160,22 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8
auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
if (conv_type_ == "conv2d_transpose") {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) {
ptr_row[j] *= -1;
}
}
}
}
} else {
for (unsigned int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) {
......@@ -162,6 +185,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
}
}
}
conv_op_desc->SetAttr("weight_scale", weight_scale);
} else if (is_weight_quantization) {
std::string scale_name = conv_weight_name + "_quant_scale";
......@@ -176,12 +200,26 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
if (conv_type_ == "conv2d_transpose") {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i];
}
}
}
} else {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i];
}
}
}
}
// compute new conv_bias
if (conv_has_bias_ && conv_op_desc->HasInput("Bias") &&
......
......@@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(graph.get());
}
// delete quant_dequant_node
for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) {
fusion::DeleteQuantDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// process quant_dequant_node
fusion::DeleteQuantDequantOpFuser dqd_fuser;
dqd_fuser(graph.get());
}
} // namespace mir
......
......@@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node");
// obtain values, save values and relink node
// obtain scale, save attrs and relink node
int bit_length = quant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_node->stmt()->op()->scope();
......@@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph,
->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range;
auto in_act_name = input_act_node->arg()->name;
auto out_act_name = output_act_node->arg()->name;
auto outlinks = output_act_node->outlinks;
for (auto* quantized_node : outlinks) {
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
op_desc->SetAttr<float>("input_scale", scale_value);
// save input scale in quantized op by input argname + index
auto op_desc = *quantized_node->stmt()->mutable_op_info();
std::string argname;
int index;
op_desc.GetInputArgname(out_act_name, &argname);
op_desc.GetInputIndex(out_act_name, &index);
op_desc.SetAttr<float>(argname + std::to_string(index) + "_input_scale",
scale_value);
op_desc.SetAttr<float>("input_scale", scale_value); // save it for now
op_desc.SetAttr<int>("bit_length", bit_length);
op_desc.UpdateAllInputs(out_act_name, in_act_name);
quantized_node->stmt()->ResetOp(op_desc, graph->valid_places());
IR_NODE_LINK_TO(input_act_node, quantized_node)
}
......@@ -125,18 +136,17 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
auto* dequant_op = matched.at("dequant_op");
auto* dequant_op_out = matched.at("dequant_op_out");
// obtain input_scale and weight_scale
// obtain weight_scale from max_range
auto* scope = quantized_op->stmt()->op()->scope();
auto& valid_places = quantized_op->stmt()->op()->valid_places();
int bit_length = quantized_op->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
float max_range = dequant_op->stmt()->op_info()->GetAttr<float>("max_range");
float whole_weight_scale =
static_cast<float>(range * range) / max_range / range;
// max_range = range * range / max(abs(weight))
// weight_scale = range * range / (range * range / max(abs(weight))) / range
// As: max_range = range * range / max(abs(weight))
// So: whole_weight_scale
// = range * range / (range * range / max(abs(weight))) / range
// = max(abs(weight)) / range
// set op desc
......@@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
// Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should
// be Cout.
weight_scale_size = quantized_weight_t->dims()[0];
} else if (quantized_op_type_ == "mul") {
} else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") {
op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
// Fc weight: Cin * Cout, the weight_scale_size should be Cout.
......@@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
weight_scale.push_back(whole_weight_scale);
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......@@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() {
->assert_is_op_output(quantized_op_type_)
->assert_is_op_input(dequant_op_type, "X")
->AsIntermediate();
// The scale var_node of input activation is deleted in DeleteQuantOpFuser
auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale")
->assert_is_op_input(dequant_op_type)
->AsIntermediate();
......@@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto* dequant_op = matched.at("dequant_op");
auto* dequant_op_out = matched.at("dequant_op_out");
// obtain input_scale and weight_scale
// obtain input weight_scale from fake_dequant op
auto* scope = quantized_op->stmt()->op()->scope();
auto& valid_places = quantized_op->stmt()->op()->valid_places();
float input_scale =
quantized_op->stmt()->op_info()->GetAttr<float>("input_scale");
std::vector<float> weight_scale;
std::vector<int> quant_bits =
......@@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
// set op desc
cpp::OpDesc op_desc = *quantized_op->stmt()->op_info();
if (quantized_op_type_ == "conv2d" ||
quantized_op_type_ == "depthwise_conv2d") {
op_desc.SetInput("Input", {quantized_op_input->arg()->name});
op_desc.SetOutput("Output", {dequant_op_out->arg()->name});
} else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") {
op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetAttr("input_scale", input_scale);
op_desc.SetAttr("weight_scale", weight_scale);
// change the weight from the float type to int8 type.
......@@ -297,14 +309,12 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max";
if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") {
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_node = VarNode("input_act_node")
->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_node =
OpNode("quant_dequant_node", quant_dequant_op_type)
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* output_scale_node =
VarNode("output_scale_node")
......@@ -312,77 +322,23 @@ void DeleteQuantDequantOpFuser::BuildPattern() {
auto* output_act_node =
VarNode("output_act_node")
->assert_is_op_output(quant_dequant_op_type, "Out");
auto* quantized_node = OpNode("quantized_node", quantized_op_type_)
->assert_is_op(quantized_op_type_);
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
output_scale_node->LinksFrom({quant_dequant_node});
output_act_node->LinksFrom({quant_dequant_node});
quantized_node->LinksFrom({output_act_node});
} else if (quantized_op_type_ == "elementwise_add") {
auto* input_scale_left_node =
VarNode("input_scale_left_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_left_node =
VarNode("input_act_left_node")
->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_left_node =
OpNode("quant_dequant_left_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* output_scale_left_node =
VarNode("output_scale_left_node")
->assert_is_op_output(quant_dequant_op_type, "OutScale");
auto* output_act_left_node =
VarNode("output_act_left_node")
->assert_is_op_output(quant_dequant_op_type, "Out")
->assert_is_op_input(quantized_op_type_, "X");
quant_dequant_left_node->LinksFrom(
{input_scale_left_node, input_act_left_node});
output_scale_left_node->LinksFrom({quant_dequant_left_node});
output_act_left_node->LinksFrom({quant_dequant_left_node});
auto* input_scale_right_node =
VarNode("input_scale_right_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_right_node =
VarNode("input_act_right_node")
->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_right_node =
OpNode("quant_dequant_right_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* output_scale_right_node =
VarNode("output_scale_right_node")
->assert_is_op_output(quant_dequant_op_type, "OutScale");
auto* output_act_right_node =
VarNode("output_act_right_node")
->assert_is_op_output(quant_dequant_op_type, "Out")
->assert_is_op_input(quantized_op_type_, "Y");
quant_dequant_right_node->LinksFrom(
{input_scale_right_node, input_act_right_node});
output_scale_right_node->LinksFrom({quant_dequant_right_node});
output_act_right_node->LinksFrom({quant_dequant_right_node});
auto* quantized_node = OpNode("quantized_node", quantized_op_type_)
->assert_is_op(quantized_op_type_);
quantized_node->LinksFrom({output_act_left_node, output_act_right_node});
} else {
LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_;
}
VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:"
<< quantized_op_type_;
}
void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") {
auto* input_scale_node = matched.at("input_scale_node");
auto* input_act_node = matched.at("input_act_node");
auto* quant_dequant_node = matched.at("quant_dequant_node");
auto* output_scale_node = matched.at("output_scale_node");
auto* output_act_node = matched.at("output_act_node");
auto* quantized_node = matched.at("quantized_node");
auto input_act_name = input_act_node->arg()->name;
auto output_act_name = output_act_node->arg()->name;
// obtain values, save values and relink node
// Get scale value from scale var node
int bit_length =
quant_dequant_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
......@@ -391,73 +347,27 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
->GetMutable<lite::Tensor>();
float scale_value = scale_tensor->data<float>()[0] / range;
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
op_desc->SetAttr<float>("input_scale", scale_value);
op_desc->SetInput("X", {input_act_node->arg()->name});
IR_NODE_LINK_TO(input_act_node, quantized_node)
auto update_op_desc = *quantized_node->stmt()->mutable_op_info();
quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places());
// delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {input_scale_node,
quant_dequant_node,
output_scale_node,
output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm);
} else if (quantized_op_type_ == "elementwise_add") {
auto* input_scale_left_node = matched.at("input_scale_left_node");
auto* input_act_left_node = matched.at("input_act_left_node");
auto* quant_dequant_left_node = matched.at("quant_dequant_left_node");
auto* output_scale_left_node = matched.at("output_scale_left_node");
auto* output_act_left_node = matched.at("output_act_left_node");
auto* input_scale_right_node = matched.at("input_scale_right_node");
auto* input_act_right_node = matched.at("input_act_right_node");
auto* quant_dequant_right_node = matched.at("quant_dequant_right_node");
auto* output_scale_right_node = matched.at("output_scale_right_node");
auto* output_act_right_node = matched.at("output_act_right_node");
auto* quantized_node = matched.at("quantized_node");
// obtain values, save values and relink node
int bit_length =
quant_dequant_left_node->stmt()->op_info()->GetAttr<int>("bit_length");
int range = ((1 << (bit_length - 1)) - 1);
auto* scope = quant_dequant_left_node->stmt()->op()->scope();
auto* left_scale_tensor =
scope->FindVar(output_scale_left_node->arg()->name)
->GetMutable<lite::Tensor>();
float left_scale_value = left_scale_tensor->data<float>()[0] / range;
auto* right_scale_tensor =
scope->FindVar(output_scale_right_node->arg()->name)
->GetMutable<lite::Tensor>();
float right_scale_value = right_scale_tensor->data<float>()[0] / range;
auto* op_desc = quantized_node->stmt()->mutable_op_info();
op_desc->SetAttr<int>("bit_length", bit_length);
op_desc->SetAttr<float>("x_input_scale", left_scale_value);
op_desc->SetAttr<float>("y_input_scale", right_scale_value);
op_desc->SetInput("X", {input_act_left_node->arg()->name});
op_desc->SetInput("Y", {input_act_right_node->arg()->name});
IR_NODE_LINK_TO(input_act_left_node, quantized_node)
IR_NODE_LINK_TO(input_act_right_node, quantized_node)
auto update_op_desc = *quantized_node->stmt()->mutable_op_info();
quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places());
auto quantized_nodes = output_act_node->outlinks;
for (auto* quantized_node : quantized_nodes) {
// Save quantization info in op_info attr
auto op_info = *quantized_node->stmt()->op_info();
std::string argname;
int index;
op_info.GetInputArgname(output_act_name, &argname);
op_info.GetInputIndex(output_act_name, &index);
op_info.SetAttr<float>(argname + std::to_string(index) + "_input_scale",
scale_value);
op_info.SetAttr<float>("input_scale", scale_value); // Save it for now
op_info.SetAttr<int>("bit_length", bit_length);
op_info.UpdateAllInputs(output_act_name, input_act_name);
quantized_node->stmt()->ResetOp(op_info, graph->valid_places());
IR_NODE_LINK_TO(input_act_node, quantized_node);
}
// delete nodes and edges
std::unordered_set<const Node*> nodes2rm = {input_scale_left_node,
quant_dequant_left_node,
output_scale_left_node,
output_act_left_node,
input_scale_right_node,
quant_dequant_right_node,
output_scale_right_node,
output_act_right_node};
std::unordered_set<const Node*> nodes2rm = {
input_scale_node, quant_dequant_node, output_scale_node, output_act_node};
GraphSafeRemoveNodes(graph, nodes2rm);
} else {
LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_;
}
}
cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
......
......@@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase {
};
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
* pooled/elementwise_add" can be detected by this fuser. The fuser
* extract the input_scale form fake_quant_dequant_op and save into
* the quantized_op. Besides, the fuser delete fake_quant_dequant_op in
* the graph.
* quantized_op" can be deteted by this fuser. The fuser modifies the input
* scale for the quantized_op and deletes the fake_quant_dequant_op.
*/
class DeleteQuantDequantOpFuser : public FuseBase {
public:
explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type)
: quantized_op_type_(quantized_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quantized_op_type_{};
};
} // namespace fusion
......
......@@ -225,6 +225,32 @@ class OpInfo : public cpp::OpDesc {
return false;
}
// For the input variable name, find the index of the corresponding
// input argname
bool GetInputIndex(const std::string &value_name, int *out) const {
for (auto &item : inputs_) {
auto it = std::find(item.second.begin(), item.second.end(), value_name);
if (it != item.second.end()) {
*out = it - item.second.begin();
return true;
}
}
return false;
}
// For the output variable name, find the index of the corresponding
// output argname
bool GetOutputIndex(const std::string &value_name, int *out) const {
for (auto &item : outputs_) {
auto it = std::find(item.second.begin(), item.second.end(), value_name);
if (it != item.second.end()) {
*out = it - item.second.begin();
return true;
}
}
return false;
}
void UpdateAllInputs(const std::string &from, const std::string &to) {
for (auto &item : inputs_) {
for (auto &var : item.second) {
......
......@@ -18,6 +18,7 @@
* of each kernel.
*/
#pragma once
#include <cmath>
#include <string>
#include <vector>
#include "lite/core/program.h"
......
......@@ -20,7 +20,7 @@
#include "lite/operators/conditional_block_op.h"
#include "lite/operators/subgraph_op.h"
#include "lite/operators/while_op.h"
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#include "lite/core/profile/precision_profiler.h"
#endif
......@@ -136,12 +136,10 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
}
void RuntimeProgram::Run() {
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
std::string precision_profiler_summary =
inst_precision_profiler.GetSummaryHeader();
#endif
#endif
for (auto& inst : instructions_) {
......@@ -149,21 +147,19 @@ void RuntimeProgram::Run() {
if (inst.is_feed_fetch_op()) continue;
#endif
inst.Run();
#ifdef LITE_WITH_PROFILE
#ifdef LITE_WITH_PRECISION_PROFILE
#ifndef LITE_WITH_FPGA
precision_profiler_summary +=
inst_precision_profiler.GetInstPrecision(&inst);
#endif
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
#endif // LITE_WITH_PRECISION_PROFILE
#endif // LITE_WITH_PROFILE
#endif
}
void Program::Build(const cpp::ProgramDesc& prog) {
......
# Introduction
As is well known, Paddle-Lite can run inference on mobile devices; in fact, it also supports training models on mobile. This article presents an example of training with Paddle-Lite. The corresponding task is "Boston housing price prediction", also known as "fit-a-line".
You can learn more about the definition and modeling of this task from the
[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
and [source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
in the book repository; the task is modeled with Linear Regression.
This article focuses on how to port it to Paddle-Lite for training.
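For reference, the model being trained is plain linear regression with a mean squared error loss; this is the standard formulation, written out here for convenience rather than taken from this repository:

```
\hat{y} = w^\top x + b, \qquad
L(w, b) = \frac{1}{N} \sum_{i=1}^{N} (\hat{y}_i - y_i)^2
```

where x is the vector of 13 housing features and y is the house price.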
Note: this is a tutorial on model training with the C++ API; the other APIs do not support training yet.
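To give a feel for what that API looks like, here is a minimal sketch of a single training step. It assumes the trainable program saved by `train.py` lives in `model_dir` and exposes the features as input 0, the label as input 1, and the loss as output 0 (the same layout used by the full `demo_trainer` shown later on this page); treat it as a sketch, not a replacement for the demo.

```cpp
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

int main() {
  // Load the trainable program (forward + backward + optimizer ops).
  CxxConfig config;
  config.set_model_dir("model_dir");  // assumed: model saved by train.py
  std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
  config.set_valid_places(valid_places);
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);

  // Feed one sample: 13 features (input 0) and its label (input 1).
  std::unique_ptr<Tensor> x(std::move(predictor->GetInput(0)));
  x->Resize(shape_t({1, 13}));
  auto* x_data = x->mutable_data<float>();
  for (int i = 0; i < 13; ++i) x_data[i] = 0.f;  // placeholder features

  std::unique_ptr<Tensor> y(std::move(predictor->GetInput(1)));
  y->Resize(shape_t({1, 1}));
  y->mutable_data<float>()[0] = 0.f;  // placeholder label

  // Each Run() executes forward, backward and the parameter update,
  // so calling it repeatedly over batches trains the model.
  predictor->Run();
  std::unique_ptr<const Tensor> loss(std::move(predictor->GetOutput(0)));
  std::cout << "Loss: " << loss->data<float>()[0] << std::endl;
  return 0;
}
```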
# Requirements
- An Android phone, used to run the training program
- Python with Paddle (version: 1.7.0) installed
# Quick start
## Step1 build paddle-lite
Follow the tutorial in the [Paddle-Lite official documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish Paddle-Lite library. Taking a build on Linux as an example, the commands are:
```shell
## set up the environment
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz
export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH
wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
unzip android-ndk-r17c-linux-x86_64.zip
export NDK_ROOT=/opt/android-ndk-r17c
## build
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite
./lite/tools/build.sh \
--arm_os=android \
--arm_abi=armv7 \
--build_extra=ON \
--arm_lang=gcc \
--android_stl=c++_static \
--build_train=ON full_publish
```
Output:
```shell
Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
```
## Step2 build lr_trainer
```shell
cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c
```
Output:
```shell
bin/
`-- demo_trainer
```
## Step3 download model and run it!
On your laptop, connect the phone via USB, enable developer mode, and run the following in any directory:
```shell
local_path=/data/local/tmp/linear_regression
adb shell "mkdir "${local_path}
# download model and push to mobile
wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
tar -zxvf lite_lr_model.tar.gz
adb push lite_lr_model/housing.data ${local_path}
adb push lite_lr_model/model_dir ${local_path}
# push lib and executable file to mobile
adb push libpaddle_full_api_shared.so ${local_path}
adb push demo_trainer ${local_path}
adb shell chmod +x ${local_path}/demo_trainer
# run it!
adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true"
```
Expected output:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
# More details
The model above was downloaded directly. If you want to generate it yourself, run the following commands:
```shell
git clone https://github.com/PaddlePaddle/Paddle-Lite.git
cd Paddle-Lite/lite/demo/cxx/train_demo/
python train.py --save_model
```
Output:
```shell
model_dir/
|-- fc_0.b_0
|-- fc_0.w_0
|-- learning_rate_0
`-- __model__
md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
```
If you want to generate your own model for training, refer to the way the model is saved in `train.py`.
# Comparison with Paddle training results
## First 10 loss values
To verify the consistency between Paddle and Lite, we trained 10 batches with identical model parameters, identical data, and batch size = 1, and recorded the loss values of both.
python + paddle command:
```shell
python train.py --num_steps=10 --batch_size=1
```
python + paddle result:
```shell
Train cost, Step 0, Cost 564.317017
Train cost, Step 1, Cost 463.900238
Train cost, Step 2, Cost 1197.537354
Train cost, Step 3, Cost 1093.833008
Train cost, Step 4, Cost 1282.760254
Train cost, Step 5, Cost 792.097351
Train cost, Step 6, Cost 491.775848
Train cost, Step 7, Cost 698.496033
Train cost, Step 8, Cost 248.444885
Train cost, Step 9, Cost 325.135132
```
c++ with paddle-lite command:
```
./demo_trainer true
```
c++ with paddle-lite result:
```
sample 0: Loss: 564.317
sample 1: Loss: 463.9
sample 2: Loss: 1197.54
sample 3: Loss: 1093.83
sample 4: Loss: 1282.76
sample 5: Loss: 792.097
sample 6: Loss: 491.776
sample 7: Loss: 698.496
sample 8: Loss: 248.445
sample 9: Loss: 325.135
```
## Loss curve
With the training batch size set to 20 and the training data globally shuffled every epoch, the loss curves of Paddle and Lite after 100 epochs compare as follows.
![lr_loss](image/lr_loss.png)
To reproduce the result above, the paddle + python commands are:
```
git clone https://github.com/PaddlePaddle/book.git
cd book/01.fit_a_line
python train.py
```
The lite + c++ command is:
```
./demo_trainer false
```
cmake_minimum_required(VERSION 2.8)
set (CMAKE_CXX_STANDARD 11)
# Project's name
if(NOT DEFINED LITE_ROOT)
message(FATAL_ERROR "please set LITE_ROOT with
-DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/")
endif()
project(demo_trainer)
# Set the output folder where your program will be created
set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
# The following folder will be included
include_directories("include")
include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include")
add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc)
TARGET_LINK_LIBRARIES(demo_trainer
"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "include/data_reader.h"
#include <limits>
using std::string;
using std::vector;
int FEATURE_NUM = 13;
float rate = 0.8;
int get_samples(string line, vector<float>* feature, float* label) {
std::istringstream reader(line);
std::vector<float> numbers;
do {
// read as many numbers as possible.
for (float number; reader >> number;) {
numbers.push_back(number);
}
// consume and discard token from stream.
if (reader.fail()) {
reader.clear();
std::string token;
reader >> token;
}
} while (!reader.eof());
assert(numbers.size() == FEATURE_NUM + 1);
for (int i = 0; i < FEATURE_NUM; i++) {
feature->push_back(numbers[i]);
}
*label = numbers[FEATURE_NUM];
return 0;
}
int normalize(const vector<vector<float>>& origin_features,
vector<vector<float>>* features,
float rate) {
int inf = std::numeric_limits<int>::max();
vector<float> min_vec(FEATURE_NUM, static_cast<float>(inf));
vector<float> max_vec(FEATURE_NUM, -(static_cast<float>(inf)));
vector<float> sum_vec(FEATURE_NUM, 0);
vector<float> avg_vec(FEATURE_NUM, 0);
for (int i = 0; i < origin_features.size(); i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
min_vec[j] = min(min_vec[j], origin_features[i][j]);
max_vec[j] = max(max_vec[j], origin_features[i][j]);
sum_vec[j] += origin_features[i][j];
}
}
for (int i = 0; i < FEATURE_NUM; i++) {
avg_vec[i] = sum_vec[i] / origin_features.size();
}
for (int i = 0; i < origin_features.size() * rate - 1; i++) {
vector<float> feat;
for (int j = 0; j < FEATURE_NUM; j++) {
feat.push_back((origin_features[i][j] - avg_vec[j]) /
(max_vec[j] - min_vec[j]));
}
features->push_back(feat);
}
return 0;
}
int read_samples(const string fname,
vector<vector<float>>* features,
vector<float>* labels) {
fstream fin;
fin.open(fname);
if (!static_cast<bool>(fin)) {
return 1;
}
vector<vector<float>> origin_features;
vector<string> lines;
string line;
while (getline(fin, line)) {
lines.push_back(line);
}
fin.close();
for (int i = 0; i < lines.size(); i++) {
vector<float> feat;
float lbl = 0;
get_samples(lines[i], &feat, &lbl);
origin_features.push_back(feat);
if (i < lines.size() * rate - 1) {
labels->push_back(lbl);
}
}
cout << "finish read fata" << endl;
normalize(origin_features, features, rate);
assert(features->size() == labels->size());
return 0;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <vector>
#include "include/data_reader.h"
#include "paddle_api.h" // NOLINT
using namespace paddle::lite_api; // NOLINT
class LRModel {
public:
void InitModel() {
// 1. Set CxxConfig
CxxConfig config;
config.set_model_dir("model_dir");
std::vector<Place> valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
config.set_valid_places(valid_places);
predictor_ = CreatePaddlePredictor<CxxConfig>(config);
}
float Predict(const vector<vector<float>>& features,
const vector<float>& labels) {
// Create Tensor
assert(features.size() == labels.size());
int batch_size = features.size();
std::unique_ptr<Tensor> input_tensor(std::move(predictor_->GetInput(0)));
input_tensor->Resize(shape_t({batch_size, FEATURE_NUM}));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < batch_size; i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
data[FEATURE_NUM * i + j] = features[i][j];
}
}
std::unique_ptr<Tensor> y_tensor(std::move(predictor_->GetInput(1)));
y_tensor->Resize(shape_t({batch_size, 1}));
auto* y_data = y_tensor->mutable_data<float>();
for (int i = 0; i < batch_size; i++) {
y_data[i] = labels[i];
}
predictor_->Run();
std::unique_ptr<const Tensor> output_tensor(
std::move(predictor_->GetOutput(0)));
return output_tensor->data<float>()[0];
}
private:
std::shared_ptr<PaddlePredictor> predictor_;
};
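// Shuffle features and labels in unison by permuting a shared index vector.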
int shuffle(vector<vector<float>>* features, vector<float>* labels) {
assert(features->size() == labels->size());
vector<int> index;
for (int i = 0; i < features->size(); i++) {
index.push_back(i);
}
random_shuffle(index.begin(), index.end());
vector<vector<float>> tmp_features;
vector<float> tmp_labels;
for (int i = 0; i < features->size(); i++) {
tmp_features.push_back((*features)[index[i]]);
tmp_labels.push_back((*labels)[index[i]]);
}
for (int i = 0; i < features->size(); i++) {
for (int j = 0; j < FEATURE_NUM; j++) {
(*features)[i][j] = tmp_features[i][j];
}
(*labels)[i] = tmp_labels[i];
}
return 0;
}
int main(int argc, char* argv[]) {
if (argc < 2) {
cerr << "usage: ./demo_trainer is_small" << endl;
cerr << " if is_small is true, the batch size is set to 1, " << endl;
cerr << " and it will only runs for 10 steps." << endl;
return 1;
}
string is_small = argv[1];
vector<vector<float>> features;
vector<float> labels;
read_samples("housing.data", &features, &labels);
cout << "sample count: " << features.size() << " " << endl;
std::shared_ptr<LRModel> local_model(new LRModel());
local_model->InitModel();
if (is_small == "true") {
cout << "small mode" << endl;
    for (int i = 0; i < 10; i++) {
vector<vector<float>> batch_feature;
vector<float> batch_label;
batch_feature.push_back(features[i]);
batch_label.push_back(labels[i]);
auto loss = local_model->Predict(batch_feature, batch_label);
cout << "sample " << i << ": " << loss << endl;
}
} else if (is_small == "false") {
// shuffle
cout << "full model" << endl;
int epoch = 100;
int batch_size = 20;
int step = 0;
    for (int i = 0; i < epoch; i++) {
shuffle(&features, &labels);
for (int j = 0;
j < ceil(static_cast<float>(features.size()) / batch_size);
j++) {
int start_idx = j * batch_size;
int end_idx =
min((j + 1) * batch_size, static_cast<int>(features.size()));
auto batch_feature = vector<vector<float>>(features.begin() + start_idx,
features.begin() + end_idx);
auto batch_label =
vector<float>(labels.begin() + start_idx, labels.begin() + end_idx);
auto loss = local_model->Predict(batch_feature, batch_label);
if (step % 10 == 0) {
std::cout << "batch: " << i << ", step: " << step
<< ", Loss: " << loss << endl;
}
step += 1;
}
}
} else {
cerr << "wrong arg for is_small: " << is_small << endl;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <assert.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using std::string;
using std::vector;
using std::cerr;
using std::cout;
using std::endl;
using std::min;
using std::max;
using std::fstream;
extern int FEATURE_NUM;
int get_samples(string line, vector<float>* feature, float* label);
int read_samples(const string fname,
vector<vector<float>>* features,
vector<float>* labels);
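# Build the demo with the Android NDK toolchain.
# Assumed usage: sh <this_script> <LITE_ROOT> <NDK_ROOT>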
rm -rf build
mkdir build
cd build
LITE_ROOT=$1
NDK_ROOT=$2
cmake .. \
-DLITE_ROOT=${LITE_ROOT} \
-DNDK_ROOT=${NDK_ROOT} \
-DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \
-DANDROID_TOOLCHAIN=gcc \
-DANDROID_ABI="armeabi-v7a" \
-DANDROID_PLATFORM=android-23 \
-DANDROID=true \
-DANDROID_STL=c++_static
make
cd ..
# ./bin/demo_trainer
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import argparse
import math
import numpy
import paddle
import paddle.fluid as fluid
def parse_args():
parser = argparse.ArgumentParser("fit_a_line")
parser.add_argument(
'--save_model',
action='store_true',
help="Whether to save main program")
parser.add_argument(
'--num_steps',
type=int,
default=1000000000000,
help="train steps")
parser.add_argument(
'--num_epochs', type=int, default=100, help="number of epochs.")
parser.add_argument(
'--batch_size', type=int, default=20, help="batch size.")
parser.add_argument(
'--shuffle',
action='store_true',
help="Whether to shuffle train data.")
args = parser.parse_args()
return args
# For training test cost
def train_test(executor, program, reader, feeder, fetch_list):
accumulated = 1 * [0]
count = 0
for data_test in reader():
outs = executor.run(
program=program, feed=feeder.feed(data_test), fetch_list=fetch_list)
accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)]
count += 1
return [x_d / count for x_d in accumulated]
def main():
if args.shuffle:
print("doing shuffle")
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.uci_housing.train(), buf_size=500),
batch_size=args.batch_size)
else:
train_reader = paddle.batch(
paddle.dataset.uci_housing.train(), batch_size=args.batch_size)
# feature vector of length 13
x = fluid.data(name='x', shape=[None, 13], dtype='float32')
y = fluid.data(name='y', shape=[None, 1], dtype='float32')
main_program = fluid.default_main_program()
startup_program = fluid.default_startup_program()
main_program.random_seed = 90
startup_program.random_seed = 90
y_predict = fluid.layers.fc(input=x, size=1, act=None)
cost = fluid.layers.square_error_cost(input=y_predict, label=y)
avg_loss = fluid.layers.mean(cost)
test_program = main_program.clone(for_test=True)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_loss)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
num_epochs = args.num_epochs
# main train loop.
feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
exe.run(startup_program)
if args.save_model:
fluid.io.save_persistables(exe, "model_dir")
# add feed and fetch op
feeded_var_names = ['x', 'y']
fetch_var_names = ['mean_0.tmp_0']
fluid.io.prepend_feed_ops(main_program, feeded_var_names)
fluid.io.append_fetch_ops(main_program, fetch_var_names)
with open("model_dir/__model__", "wb") as f:
f.write(main_program.desc.serialize_to_string())
with open("debug_main_program", "w") as f:
f.write(str(main_program))
print("train model saved to model_dir")
return
train_prompt = "Train cost"
step = 0
for pass_id in range(num_epochs):
for data_train in train_reader():
avg_loss_value, = exe.run(
main_program,
feed=feeder.feed(data_train),
fetch_list=[avg_loss])
print("%s, Step %d, Cost %f" %
(train_prompt, step, avg_loss_value[0]))
if step == args.num_steps - 1:
return
step += 1
if math.isnan(float(avg_loss_value[0])):
sys.exit("got NaN loss, training failed.")
if __name__ == '__main__':
args = parse_args()
main()
......@@ -106,13 +106,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math
# 4. training kernels
add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
if(LITE_WITH_TRAIN)
add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
endif()
add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
......
......@@ -30,6 +30,8 @@ lite_cc_library(subgraph_bridge_conv_transpose_op_bm SRCS conv_transpose_op.cc D
lite_cc_library(subgraph_bridge_reduce_full_op_bm SRCS reduce_full_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_squeeze_op_bm SRCS squeeze_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_cast_op_bm SRCS cast_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fill_constant_op_bm SRCS fill_constant_op.cc DEPS ${bm_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_assign_value_op_bm SRCS assign_value_op.cc DEPS ${bm_subgraph_bridge_deps})
set(bm_subgraph_bridges
subgraph_bridge_registry
......@@ -58,4 +60,6 @@ set(bm_subgraph_bridges
subgraph_bridge_reduce_full_op_bm
subgraph_bridge_squeeze_op_bm
subgraph_bridge_cast_op_bm
subgraph_bridge_fill_constant_op_bm
subgraph_bridge_assign_value_op_bm
CACHE INTERNAL "bm_subgraph_bridges")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <bmcompiler_op_code.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
......@@ -35,16 +36,14 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
const int64_t* x_shape_data = const_cast<const int64_t*>(&x_dims.data()[0]);
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
bool x_is_const = !graph->HasNode(x_var_name);
std::vector<int32_t> i_x_shape_data(x_dims.size());
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < x_dims.size(); i++) {
i_x_shape_data[i] = static_cast<int>(x_shape_data[i]);
i_x_shape_data[i] = x_dims[i];
}
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
i_output_shape_data[i] = output_dims[i];
}
float alpha = 0.f;
int active_type_id = 0;
......@@ -59,6 +58,15 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
LOG(FATAL) << "[BM] unsupport act type";
return FAILED;
}
const float* x_data = const_cast<const float*>(x->mutable_data<float>());
if (x_is_const) {
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(x_var_name.c_str()),
const_cast<const int*>(&i_x_shape_data[0]),
x_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
static_cast<const void*>(x_data));
}
if (op_type == "relu" || op_type == "leaky_relu") {
add_relu_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_shape_data[0]),
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
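// Materializes the op's fp32_values attribute as a constant tensor in the BM
// graph, so no runtime input is needed for it.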
int AssignValueConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
int buffer_size = 1;
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_dims[i]);
buffer_size *= i_output_shape_data[i];
}
auto fp32_values = op_info->GetAttr<std::vector<float>>("fp32_values");
float* assign_data =
reinterpret_cast<float*>(malloc(buffer_size * sizeof(float)));
CHECK(assign_data != nullptr);
  CHECK_EQ(buffer_size, fp32_values.size());
  // Copy the attribute values into the buffer handed to the BM compiler.
  for (int i = 0; i < buffer_size; i++) {
    assign_data[i] = fp32_values[i];
  }
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(i_output_shape_data.data()),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
reinterpret_cast<const void*>(assign_data));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(assign_value,
kBM,
paddle::lite::subgraph::bm::AssignValueConverter);
......@@ -39,6 +39,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto filter_var_name = op_info->Input("Filter").front();
auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>();
auto filter_dims = filter->dims();
CHECK_EQ(input_dims.size(), 4);
CHECK_EQ(output_dims.size(), 4);
CHECK_EQ(filter_dims.size(), 4);
......@@ -90,6 +91,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
dilations[1],
static_cast<int>(has_bias));
graph->AddNode(output_var_name);
LOG(INFO) << output_var_name << input_dims << " " << output_dims;
return SUCCESS;
}
......
......@@ -65,6 +65,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto output_dims = output->dims();
const int64_t* output_shape_data =
const_cast<const int64_t*>(&output_dims.data()[0]);
LOG(INFO) << x_dims << " " << output_dims;
std::vector<int32_t> i_output_shape_data(output_dims.size());
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace bm {
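// Materializes a tensor filled with the scalar `value` attribute as a
// constant tensor in the BM graph.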
int FillConstantConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto scope = op->scope();
auto op_info = op->op_info();
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
auto output_dims = output->dims();
std::vector<int32_t> i_output_shape_data(output_dims.size());
int buffer_size = 1;
  for (size_t i = 0; i < output_dims.size(); i++) {
    i_output_shape_data[i] = static_cast<int>(output_dims[i]);
    buffer_size *= i_output_shape_data[i];
  }
float* const_data =
reinterpret_cast<float*>(malloc(buffer_size * sizeof(float)));
CHECK(const_data != nullptr);
auto value = op_info->GetAttr<float>("value");
for (size_t i = 0; i < buffer_size; i++) {
const_data[i] = value;
}
bm_add_const_tensor(graph->GetCompilerHandle(),
static_cast<const char*>(output_var_name.c_str()),
const_cast<const int*>(i_output_shape_data.data()),
output_dims.size(),
static_cast<bm_data_type_t>(DTYPE_FP32),
reinterpret_cast<const void*>(const_data));
graph->AddNode(output_var_name);
return SUCCESS;
}
} // namespace bm
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(fill_constant,
kBM,
paddle::lite::subgraph::bm::FillConstantConverter);
......@@ -29,7 +29,6 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_name = lite::subgraph::bm::UniqueName(op_type);
// only support y is const
// input
auto x_var_name = op_info->Input("X").front();
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
......@@ -61,6 +60,12 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto y_var_name = op_info->Input("Y").front();
auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
auto y_dims = y->dims();
bool y_is_const = !graph->HasNode(y_var_name);
CHECK_EQ(y_dims.size(), 2);
int i_y_shape_data[2];
for (size_t i = 0; i < 2; i++) {
i_y_shape_data[i] = y_dims[i];
}
// output
auto output_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(output_var_name)->GetMutable<lite::Tensor>();
......@@ -71,6 +76,7 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
for (size_t i = 0; i < output_dims.size(); i++) {
i_output_shape_data[i] = static_cast<int>(output_shape_data[i]);
}
if (y_is_const) {
add_fc_layer(graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_reshape_shape_data[0]),
2,
......@@ -85,6 +91,24 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
nullptr,
0,
0);
} else {
add_fc_weight_layer(
graph->GetCompilerHandle(),
const_cast<const int*>(&i_x_reshape_shape_data[0]),
2,
static_cast<const char*>(unique_op_reshape_name.c_str()),
const_cast<const int*>(&i_output_shape_data[0]),
output_dims.size(),
static_cast<const char*>(output_var_name.c_str()),
static_cast<const char*>(unique_op_name.c_str()),
const_cast<const int*>(&i_y_shape_data[0]),
2,
static_cast<const char*>(y_var_name.c_str()),
i_x_reshape_shape_data[1],
nullptr,
0,
0);
}
graph->AddNode(output_var_name);
return SUCCESS;
}
......
......@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kBM);
USE_SUBGRAPH_BRIDGE(squeeze, kBM);
USE_SUBGRAPH_BRIDGE(squeeze2, kBM);
USE_SUBGRAPH_BRIDGE(cast, kBM);
USE_SUBGRAPH_BRIDGE(fill_constant, kBM);
USE_SUBGRAPH_BRIDGE(assign_value, kBM);
......@@ -35,7 +35,7 @@ int SubgraphEngine::BuildDeviceProgram() {
graph.CreateCompilerHandle();
auto& ctx = this->ctx_->template As<BMContext>();
for (auto& inst : origin_program_) {
auto op = inst.op();
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
......
......@@ -8,6 +8,8 @@ add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_de
add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps})
add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps})
add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(abs_compute_cuda CUDA basic SRCS abs_compute.cu DEPS ${lite_kernel_deps})
add_kernel(tanh_compute_cuda CUDA basic SRCS tanh_compute.cu DEPS ${lite_kernel_deps})
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
......@@ -45,6 +47,8 @@ lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_
#nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
nv_test(nearest_interp_compute_cuda_test SRCS nearest_interp_compute_test.cc DEPS nearest_interp_compute_cuda)
nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda)
nv_test(abs_compute_cuda_test SRCS abs_compute_test.cc DEPS abs_compute_cuda)
nv_test(tanh_compute_cuda_test SRCS tanh_compute_test.cc DEPS tanh_compute_cuda)
nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda)
nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda)
nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda)
......@@ -61,7 +65,7 @@ nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc
#nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda)
#nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda)
nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda)
#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda)
#nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda)
#nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda)
if(LITE_BUILD_EXTRA)
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/abs_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
__global__ void AbsKernel(const int num, const T* input, T* output);
template <>
__global__ void AbsKernel<float>(const int num,
const float* input,
float* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabsf(input[index]);
}
}
template <>
__global__ void AbsKernel<double>(const int num,
const double* input,
double* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = fabs(input[index]);
}
}
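// Element-wise launch: one thread per element, 512 threads per block and
// ceil(num / 512) blocks, issued on the kernel's exec stream.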
void AbsCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
AbsKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
abs, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::AbsCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class AbsCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~AbsCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/abs_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(abs, normal) {
AbsCompute abs_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
abs_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
abs_kernel.SetContext(std::move(ctx));
abs_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], std::fabs(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -152,6 +152,18 @@ void ElementwiseAddComputeNHWC::Run() {
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseSubCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseSubComputeNHWC::Run() {
ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kSUB, false)
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
}
void ElementwiseMulCompute::Run() {
ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false)
cudaError_t error = cudaGetLastError();
......@@ -204,6 +216,17 @@ REGISTER_LITE_KERNEL(elementwise_add,
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::ElementwiseSubCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_add,
kCUDA,
kFloat,
......@@ -224,6 +247,26 @@ REGISTER_LITE_KERNEL(elementwise_add,
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_sub,
kCUDA,
kFloat,
kNHWC,
paddle::lite::kernels::cuda::ElementwiseSubComputeNHWC,
nhwc_format)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kCUDA),
PRECISION(kFloat),
DATALAYOUT(kNHWC))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_mul,
kCUDA,
kFloat,
......
......@@ -38,6 +38,24 @@ class ElementwiseAddComputeNHWC
virtual ~ElementwiseAddComputeNHWC() = default;
};
class ElementwiseSubCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubCompute() = default;
};
class ElementwiseSubComputeNHWC
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override;
virtual ~ElementwiseSubComputeNHWC() = default;
};
class ElementwiseMulCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/tanh_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
template <typename T>
__global__ void TanhKernel(const int num, const T* input, T* output) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
output[index] = tanh(input[index]);
}
}
void TanhCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int num = static_cast<int>(param.X->numel());
auto input = param.X->data<float>();
auto output = param.Out->mutable_data<float>(TARGET(kCUDA));
const int threads = 512;
const int blocks = (num + threads - 1) / threads;
TanhKernel<float><<<blocks, threads, 0, stream>>>(num, input, output);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
tanh, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::TanhCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class TanhCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~TanhCompute() = default;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/cuda/tanh_compute.h"
#include <gtest/gtest.h>
#include <cmath>
#include <memory>
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
TEST(tanh, fp32) {
TanhCompute tanh_kernel;
std::unique_ptr<KernelContext> ctx(new KernelContext);
auto& context = ctx->As<CUDAContext>();
operators::ActivationParam param;
Tensor x, y, x_cpu, y_cpu;
int h = 3, w = 3;
y.Resize({h, w});
x_cpu.Resize({h, w});
y_cpu.Resize({h, w});
auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
float* x_cpu_data = x_cpu.mutable_data<float>();
float* y_cpu_data = y_cpu.mutable_data<float>();
for (int i = 0; i < x_cpu.numel(); i++) {
x_cpu_data[i] = i - 1.5;
}
x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
param.X = &x;
param.Out = &y;
tanh_kernel.SetParam(param);
cudaStream_t stream;
cudaStreamCreate(&stream);
context.SetExecStream(stream);
tanh_kernel.SetContext(std::move(ctx));
tanh_kernel.Launch();
cudaDeviceSynchronize();
CopySync<TARGET(kCUDA)>(
y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH);
for (int i = 0; i < y.numel(); i++) {
EXPECT_NEAR(y_cpu_data[i], tanh(x_cpu_data[i]), 1e-5);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -5,6 +5,3 @@ add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kerne
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
#lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
#lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
......@@ -92,6 +92,7 @@ T PolyIoU(const T* box1,
const size_t box_size,
const bool normalized) {
LOG(FATAL) << "PolyIoU not implement.";
return *box1;
}
template <class T>
......@@ -369,6 +370,7 @@ void MulticlassNmsCompute::Run() {
}
} else {
outs->Resize({static_cast<int64_t>(num_kept), out_dim});
outs->mutable_data<float>();
int offset = 0;
int* oindices = nullptr;
for (int i = 0; i < n; ++i) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/multiclass_nms_compute.h"
#include <gtest/gtest.h>
#include <map>
#include <utility>
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename dtype>
static bool sort_score_pair_descend(const std::pair<float, dtype>& pair1,
const std::pair<float, dtype>& pair2) {
return pair1.first > pair2.first;
}
template <typename dtype>
void get_max_score_index(const dtype* scores,
int num,
float threshold,
int top_k,
std::vector<std::pair<dtype, int>>* score_index_vec) {
//! Generate index score pairs.
for (int i = 0; i < num; ++i) {
if (scores[i] > threshold) {
score_index_vec->push_back(std::make_pair(scores[i], i));
}
}
//! Sort the score pair according to the scores in descending order
std::stable_sort(score_index_vec->begin(),
score_index_vec->end(),
sort_score_pair_descend<int>);
//! Keep top_k scores if needed.
if (top_k > -1 && top_k < score_index_vec->size()) {
score_index_vec->resize(top_k);
}
}
template <typename dtype>
dtype bbox_size(const dtype* bbox, bool normalized = true) {
if (bbox[2] < bbox[0] || bbox[3] < bbox[1]) {
// If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
return dtype(0.);
} else {
const dtype width = bbox[2] - bbox[0];
const dtype height = bbox[3] - bbox[1];
if (normalized) {
return width * height;
} else {
// If bbox is not within range [0, 1].
return (width + 1) * (height + 1);
}
}
}
template <typename dtype>
dtype jaccard_overlap(const dtype* bbox1, const dtype* bbox2) {
if (bbox2[0] > bbox1[2] || bbox2[2] < bbox1[0] || bbox2[1] > bbox1[3] ||
bbox2[3] < bbox1[1]) {
return dtype(0.);
} else {
const dtype inter_xmin = std::max(bbox1[0], bbox2[0]);
const dtype inter_ymin = std::max(bbox1[1], bbox2[1]);
const dtype inter_xmax = std::min(bbox1[2], bbox2[2]);
const dtype inter_ymax = std::min(bbox1[3], bbox2[3]);
const dtype inter_width = inter_xmax - inter_xmin;
const dtype inter_height = inter_ymax - inter_ymin;
const dtype inter_size = inter_width * inter_height;
const dtype bbox1_size = bbox_size(bbox1);
const dtype bbox2_size = bbox_size(bbox2);
return inter_size / (bbox1_size + bbox2_size - inter_size);
}
}
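// Greedy NMS: repeatedly keep the highest-scoring candidate and drop any
// later box whose IoU with an already-kept box exceeds the (adaptively
// decayed) threshold.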
template <typename dtype>
void apply_nms_fast(const dtype* bboxes,
const dtype* scores,
int num,
float score_threshold,
float nms_threshold,
float eta,
int top_k,
std::vector<int>* indices) {
// Get top_k scores (with corresponding indices).
std::vector<std::pair<dtype, int>> score_index_vec;
get_max_score_index(scores, num, score_threshold, top_k, &score_index_vec);
// Do nms.
float adaptive_threshold = nms_threshold;
indices->clear();
while (score_index_vec.size() != 0) {
const int idx = score_index_vec.front().second;
bool keep = true;
for (int k = 0; k < indices->size(); ++k) {
if (keep) {
const int kept_idx = (*indices)[k];
float overlap =
jaccard_overlap(bboxes + idx * 4, bboxes + kept_idx * 4);
keep = overlap <= adaptive_threshold;
} else {
break;
}
}
if (keep) {
indices->push_back(idx);
}
score_index_vec.erase(score_index_vec.begin());
if (keep && eta < 1 && adaptive_threshold > 0.5) {
adaptive_threshold *= eta;
}
}
}
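// Reference implementation used to check the kernel: per image, run NMS for
// every non-background class, optionally keep only the top keep_top_k
// detections, and emit rows of [label, score, xmin, ymin, xmax, ymax].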
template <typename dtype>
void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
int class_num,
const std::vector<int>& priors,
bool share_location,
std::vector<float>* result) {
int background_id = param.background_label;
int keep_topk = param.keep_top_k;
int nms_topk = param.nms_top_k;
float conf_thresh = param.score_threshold;
float nms_thresh = param.nms_threshold;
float nms_eta = param.nms_eta;
const dtype* bbox_data = param.bboxes->data<const dtype>();
const dtype* conf_data = param.scores->data<const dtype>();
dtype* out = param.out->mutable_data<dtype>();
(*result).clear();
int num_kept = 0;
std::vector<std::map<int, std::vector<int>>> all_indices;
int64_t conf_offset = 0;
int64_t bbox_offset = 0;
for (int i = 0; i < priors.size(); ++i) {
std::map<int, std::vector<int>> indices;
int num_det = 0;
int num_priors = priors[i];
int conf_idx = class_num * conf_offset;
int bbox_idx =
share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
for (int c = 0; c < class_num; ++c) {
if (c == background_id) {
// Ignore background class
continue;
}
const dtype* cur_conf_data = conf_data + conf_idx + c * num_priors;
const dtype* cur_bbox_data = bbox_data + bbox_idx;
if (!share_location) {
cur_bbox_data += c * num_priors * 4;
}
apply_nms_fast(cur_bbox_data,
cur_conf_data,
num_priors,
conf_thresh,
nms_thresh,
nms_eta,
nms_topk,
&(indices[c]));
num_det += indices[c].size();
}
if (keep_topk > -1 && num_det > keep_topk) {
std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
for (auto it = indices.begin(); it != indices.end(); ++it) {
int label = it->first;
const std::vector<int>& label_indices = it->second;
for (int j = 0; j < label_indices.size(); ++j) {
int idx = label_indices[j];
float score = conf_data[conf_idx + label * num_priors + idx];
score_index_pairs.push_back(
std::make_pair(score, std::make_pair(label, idx)));
}
}
// Keep top k results per image.
std::stable_sort(score_index_pairs.begin(),
score_index_pairs.end(),
sort_score_pair_descend<std::pair<int, int>>);
score_index_pairs.resize(keep_topk);
// Store the new indices.
std::map<int, std::vector<int>> new_indices;
for (int j = 0; j < score_index_pairs.size(); ++j) {
int label = score_index_pairs[j].second.first;
int idx = score_index_pairs[j].second.second;
new_indices[label].push_back(idx);
}
all_indices.push_back(new_indices);
num_kept += keep_topk;
} else {
all_indices.push_back(indices);
num_kept += num_det;
}
conf_offset += num_priors;
bbox_offset += num_priors;
}
if (num_kept == 0) {
(*result).clear();
(*result).resize(1);
(*result)[0] = -1;
return;
} else {
(*result).resize(num_kept * 6);
}
int count = 0;
conf_offset = 0;
bbox_offset = 0;
for (int i = 0; i < priors.size(); ++i) {
int num_priors = priors[i];
int conf_idx = class_num * conf_offset;
int bbox_idx =
share_location ? bbox_offset * 4 : bbox_offset * 4 * class_num;
for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) {
int label = it->first;
std::vector<int>& indices = it->second;
const dtype* cur_conf_data = conf_data + conf_idx + label * num_priors;
const dtype* cur_bbox_data = bbox_data + bbox_idx;
if (!share_location) {
cur_bbox_data += label * num_priors * 4;
}
for (int j = 0; j < indices.size(); ++j) {
int idx = indices[j];
(*result)[count * 6] = label;
(*result)[count * 6 + 1] = cur_conf_data[idx];
for (int k = 0; k < 4; ++k) {
(*result)[count * 6 + 2 + k] = cur_bbox_data[idx * 4 + k];
}
++count;
}
}
conf_offset += num_priors;
bbox_offset += num_priors;
}
}
TEST(multiclass_nms_host, init) {
MulticlassNmsCompute multiclass_nms;
ASSERT_EQ(multiclass_nms.precision(), PRECISION(kFloat));
ASSERT_EQ(multiclass_nms.target(), TARGET(kHost));
}
TEST(multiclass_nms_host, retrive_op) {
auto multiclass_nms =
KernelRegistry::Global().Create<TARGET(kHost), PRECISION(kFloat)>(
"multiclass_nms");
ASSERT_FALSE(multiclass_nms.empty());
ASSERT_TRUE(multiclass_nms.front());
}
TEST(multiclass_nms_host, compute) {
MulticlassNmsCompute multiclass_nms;
operators::MulticlassNmsParam param;
lite::Tensor bbox, conf, out;
std::vector<float> out_ref;
for (std::vector<int> priors : {std::vector<int>({2, 2, 2})}) {
int N = priors.size();
for (bool share_location : {true}) {
for (int class_num : {1, 4, 10}) {
DDim* bbox_dim;
DDim* conf_dim;
int M = priors[0];
if (share_location) {
bbox_dim = new DDim({N, M, 4});
} else {
bbox_dim = new DDim({class_num, M, 4});
}
conf_dim = new DDim({N, class_num, M});
bbox.Resize(*bbox_dim);
conf.Resize(*conf_dim);
for (int background_id : {0}) {
for (int keep_topk : {1, 5, 10}) {
for (int nms_topk : {1, 5, 10}) {
for (float nms_eta : {1.0, 0.99, 0.9}) {
for (float nms_thresh : {0.5, 0.7}) {
for (float conf_thresh : {0.5, 0.7}) {
auto* conf_data = conf.mutable_data<float>();
auto* bbox_data = bbox.mutable_data<float>();
for (int i = 0; i < bbox_dim->production(); ++i) {
bbox_data[i] = i * 1. / bbox_dim->production();
}
for (int i = 0; i < conf_dim->production(); ++i) {
conf_data[i] = i * 1. / conf_dim->production();
}
param.bboxes = &bbox;
param.scores = &conf;
param.out = &out;
param.background_label = background_id;
param.keep_top_k = keep_topk;
param.nms_top_k = nms_topk;
param.score_threshold = conf_thresh;
param.nms_threshold = nms_thresh;
param.nms_eta = nms_eta;
multiclass_nms.SetParam(param);
multiclass_nms.Run();
auto* out_data = out.mutable_data<float>();
out_ref.clear();
multiclass_nms_compute_ref<float>(
param, class_num, priors, share_location, &out_ref);
EXPECT_EQ(out.dims().production(), out_ref.size());
if (out.dims().production() == out_ref.size()) {
auto* out_ref_data = out_ref.data();
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
delete bbox_dim;
delete conf_dim;
}
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
......@@ -32,8 +32,10 @@ class ReluCompute
std::string doc() const override { return "Relu using cl::Buffer, kFloat"; }
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/relu_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/relu_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -46,7 +48,7 @@ class ReluCompute
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
......@@ -74,6 +76,7 @@ class ReluCompute
private:
std::string kernel_func_name_{"relu"};
std::string build_options_{"-DCL_DTYPE_float -DRELU"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......@@ -87,8 +90,10 @@ class SigmoidCompute
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/sigmoid_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/sigmoid_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -101,7 +106,7 @@ class SigmoidCompute
auto* x_buf = param.X->data<float, cl::Buffer>();
auto* out_buf = param.Out->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
......@@ -129,6 +134,7 @@ class SigmoidCompute
private:
std::string kernel_func_name_{"sigmoid"};
std::string build_options_{"-DCL_DTYPE_float -DSIGMOID"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -37,11 +37,12 @@ class ActivationComputeImageDefault
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
act_param_ = param_.get_mutable<param_t>();
int act_type = static_cast<int>(act_param_->active_type);
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "ActivationTypeToStr(act_param_->active_type):"
<< ActivationTypeToStr(act_param_->active_type);
#endif
switch (act_type) {
case 1:
kernel_func_name_ = "relu";
......@@ -71,41 +72,70 @@ class ActivationComputeImageDefault
LOG(FATAL) << "This act type:" << act_type << " doesn't support.";
return;
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/activation_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_img = param.X->data<half_t, cl::Image2D>();
auto image_shape = InitImageDimInfoWith(x_dims);
auto* out_img = param.Out->mutable_data<half_t, cl::Image2D>(
image_shape["width"], image_shape["height"]);
const auto& y_dims = param.Out->dims(); // useless: check dim only
#endif
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
context.cl_context()->AddKernel(kernel_func_name_,
"image/activation_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_img);
void ReInitWhenNeeded() override {
act_param_ = param_.get_mutable<param_t>();
auto x_dims = act_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->X->dims()); // w, h
out_img_shape_ = default_convertor.InitImageDimInfoWith(
act_param_->Out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
static_cast<cl::size_type>(x_img_shape_[1])};
}
void Run() override {
auto* x_img = act_param_->X->data<half_t, cl::Image2D>();
auto* out_img = act_param_->Out->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
auto kernel = kernel_;
cl_int status;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, threshold_);
status = kernel.setArg(2, threshold_);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale_);
status = kernel.setArg(3, scale_);
CL_CHECK_FATAL(status);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
const auto& x_dims = act_param_->X->dims();
const auto& y_dims = act_param_->Out->dims(); // useless: check dim only
VLOG(4) << TargetToStr(act_param_->X->target());
VLOG(4) << TargetToStr(act_param_->Out->target());
VLOG(4) << "x_img_shape_(w,h):" << x_img_shape_[0] << " "
<< x_img_shape_[1];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
......@@ -115,13 +145,12 @@ class ActivationComputeImageDefault
VLOG(4) << "kernel func name:" << kernel_func_name_;
#endif
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -131,10 +160,20 @@ class ActivationComputeImageDefault
private:
param_t* act_param_{nullptr};
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim last_x_dims_;
std::string kernel_func_name_{};
float threshold_{6.f};
float scale_{1.f};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
......
......@@ -43,8 +43,10 @@ class BilinearInterpImageCompute
bilinear_interp_param_ = param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/bilinear_interp_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/bilinear_interp_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -103,7 +105,7 @@ class BilinearInterpImageCompute
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
......@@ -159,6 +161,7 @@ class BilinearInterpImageCompute
param_t* bilinear_interp_param_{nullptr};
std::string kernel_func_name_{"bilinear_interp"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -38,8 +38,10 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
} else {
kernel_func_name_ = "concat_mul";
}
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/concat_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/concat_kernel.cl",
build_options_,
time_stamp_);
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
......@@ -88,7 +90,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto inputs = param.x;
int arg_idx = 0;
......@@ -177,6 +179,7 @@ class ConcatCompute : public KernelLite<TARGET(kOpenCL),
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -40,8 +40,10 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
kernel_func_name_ = "concat_mul";
}
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/concat_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/concat_kernel.cl",
build_options_,
time_stamp_);
auto axis = concat_param_->axis;
auto inputs = concat_param_->x;
......@@ -117,7 +119,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto inputs = param.x;
int arg_idx = 0;
......@@ -251,6 +253,7 @@ class ConcatComputeImage : public KernelLite<TARGET(kOpenCL),
param_t* concat_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{" -DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -114,8 +114,10 @@ void ConvCompute::PrepareForRun() {
}
for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
context.cl_context()->AddKernel(kernel_func_names_[i],
kernel_func_paths_[i],
build_options_[i],
time_stamp_);
}
}
......@@ -153,7 +155,7 @@ void ConvCompute::GemmlikeConv2d() {
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
auto img2col_kernel = context.cl_context()->GetKernel(kernel_key.str());
int n_threads = c_in * h_out * w_out;
......@@ -218,7 +220,7 @@ void ConvCompute::GemmlikeConv2d() {
int n = h_out * w_out;
VLOG(4) << "m = " << m << " n = " << n << " k = " << k;
kernel_key.str("");
kernel_key << kernel_func_names_[1] << build_options_[1];
kernel_key << kernel_func_names_[1] << build_options_[1] << time_stamp_;
auto gemm_kernel = context.cl_context()->GetKernel(kernel_key.str());
GemmBatched(
gemm_kernel, col_buf, filter_buf, bias_buf, output_buf, bs, m, n, k);
......@@ -249,7 +251,8 @@ void ConvCompute::Conv2d1x1() {
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
kernel_key << kernel_func_names_.front() << build_options_.front();
kernel_key << kernel_func_names_.front() << build_options_.front()
<< time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k);
......
......@@ -21,6 +21,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
......@@ -55,6 +56,7 @@ class ConvCompute
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -369,15 +369,17 @@ void ConvImageCompute::PrepareForRun() {
build_options_.push_back(build_options_single);
for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
context.cl_context()->AddKernel(kernel_func_names_[i],
kernel_func_paths_[i],
build_options_[i],
time_stamp_);
}
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
......@@ -388,18 +390,43 @@ void ConvImageCompute::PrepareForRun() {
VLOG(4) << "max_work_group_size: " << max_work_group_size;
if (max_work_group_size > 0 && use_lws) {
// local_work_size_ = context.cl_context()->LocalWorkSizeConv1x1(
// global_work_size_, max_work_group_size);
local_work_size_ = context.cl_context()->LocalWorkSize(global_work_size_,
max_work_group_size);
if (max_work_group_size > 0 && use_lws_) {
double min_turn_time = DBL_MAX;
cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize(
global_work_size_, max_work_group_size);
cl::NDRange last_local_work_size = cl::NDRange{
static_cast<size_t>(0), static_cast<size_t>(0), static_cast<size_t>(0)};
if (use_turn_) {
for (size_t i = 1; i < 15; i++) {
if (kernel_h == 1 && kernel_w == 1) {
          // TODO: use a dedicated policy for the 1x1 case (both branches are identical for now)
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
} else {
local_work_size_ = context.cl_context()->LocalWorkSizeTurn(
global_work_size_, max_work_group_size, i);
}
if (last_local_work_size[0] == local_work_size_[0] &&
last_local_work_size[1] == local_work_size_[1] &&
last_local_work_size[2] == local_work_size_[2]) {
          // skip local work sizes that were already tried during tuning
continue;
}
auto turn_time = this->Turn(5);
if (min_turn_time > turn_time) {
min_turn_time = turn_time;
best_local_work_size = local_work_size_;
}
last_local_work_size = local_work_size_;
}
}
local_work_size_ = best_local_work_size;
VLOG(4) << "local_work_size_[3D]: {" << local_work_size_[0] << ","
<< local_work_size_[1] << "," << local_work_size_[2] << "}";
}
}
void ConvImageCompute::Conv2d1x1opt() {
void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -431,23 +458,6 @@ void ConvImageCompute::Conv2d1x1opt() {
int input_c = input_dims[1];
auto dilations = *param.dilations;
// const std::vector<size_t>& default_work_size =
// DefaultWorkSize(output_dims,
// DDim(std::vector<DDim::value_type>{
// static_cast<int64_t>(out_image_shape["width"]),
// static_cast<int64_t>(out_image_shape["height"])}));
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// int maped_w = maptofactor(w, 4);
// auto global_work_size_ =
// cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
// static_cast<size_t>(maped_w),
// static_cast<size_t>(default_work_size.data()[2])};
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << ","
......@@ -541,73 +551,12 @@ void ConvImageCompute::Conv2d1x1opt() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
#ifdef PROFILE_CONV_KERNEL
bool use_profile = false;
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
double start = GetCurrentUS();
if (use_profile) {
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
} else {
int count = 50;
double sumtime = 0;
if (!use_profile) {
count = 1;
}
for (size_t i = 0; i < count; i++) {
start = GetCurrentUS();
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (use_profile) {
if (is_turn) {
event_->wait();
double duration = GetCurrentUS() - start;
sumtime += duration;
}
}
auto dims_string = [](DDimLite dims) -> std::string {
std::ostringstream stream;
stream << "[" << dims[0] << "," << dims[1] << "," << dims[2] << ","
<< dims[3] << "]";
return stream.str();
};
if (use_profile) {
// LOG(INFO) << "input: " << input_dims;
// LOG(INFO) << "filter: " << filter_dims;
// LOG(INFO) << "output: " << output_dims;
std::cout << std::setw(25) << std::left << dims_string(input_dims)
<< std::setw(25) << std::left << dims_string(filter_dims)
<< std::setw(25) << std::left << dims_string(output_dims)
<< std::setw(25) << std::left << sumtime / count << std::endl;
} else {
dims_string(input_dims);
}
}
#endif
}
void ConvImageCompute::Conv2d3x3() {
void ConvImageCompute::Conv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -767,9 +716,13 @@ void ConvImageCompute::Conv2d3x3() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::Conv2d3x3opt() {
void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -890,9 +843,12 @@ void ConvImageCompute::Conv2d3x3opt() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::Conv2d5x5() {
void ConvImageCompute::Conv2d5x5(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1018,9 +974,12 @@ void ConvImageCompute::Conv2d5x5() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::Conv2d5x5opt() {
void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1134,9 +1093,12 @@ void ConvImageCompute::Conv2d5x5opt() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::Conv2d7x7() {
void ConvImageCompute::Conv2d7x7(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1262,8 +1224,12 @@ void ConvImageCompute::Conv2d7x7() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::Conv2d7x7opt() {
void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1374,8 +1340,12 @@ void ConvImageCompute::Conv2d7x7opt() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::DepthwiseConv2d3x3s1() {
void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1454,9 +1424,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::DepthwiseConv2d3x3() {
void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1548,9 +1522,13 @@ void ConvImageCompute::DepthwiseConv2d3x3() {
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(output_img, event_);
if (is_turn) {
event_->wait();
}
}
void ConvImageCompute::DepthwiseConv2d() {
void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
const auto& param = *param_.get_mutable<param_t>();
......@@ -1683,8 +1661,22 @@ void ConvImageCompute::DepthwiseConv2d() {
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Run() { (this->*impl_)(); }
#undef PROFILE_CONV_KERNEL
void ConvImageCompute::Run() { (this->*impl_)(false); }
double ConvImageCompute::Turn(int times) {
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start = GetCurrentUS();
for (size_t i = 0; i < times; i++) {
(this->*impl_)(true);
}
auto time_diff = (GetCurrentUS() - start) / times;
return time_diff;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
......
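The conv image kernels above gain a simple auto-tuning pass: when use_turn_ is set, PrepareForRun walks candidate local work sizes from LocalWorkSizeTurn, times each candidate through Turn() (which calls the selected conv impl with is_turn = true so event_->wait() is reached and the measurement covers actual kernel execution), and keeps the fastest one; candidates identical to the previous one are skipped so nothing is timed twice. A minimal, framework-independent sketch of that candidate-selection pattern follows; PickFastest and Config are illustrative names, not part of the Paddle-Lite API, and the candidate list is assumed non-empty.

// Minimal sketch of the tuning idea: time each candidate configuration a few
// times and keep the fastest. PickFastest/Config are illustrative only.
#include <chrono>
#include <functional>
#include <limits>
#include <vector>

template <typename Config>
Config PickFastest(const std::vector<Config>& candidates,
                   const std::function<void(const Config&)>& run,
                   int repeats = 5) {
  double best_us = std::numeric_limits<double>::max();
  Config best = candidates.front();  // assumes at least one candidate
  for (const auto& c : candidates) {
    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < repeats; ++i) run(c);  // blocking run, like is_turn = true
    auto end = std::chrono::high_resolution_clock::now();
    double avg_us =
        std::chrono::duration<double, std::micro>(end - start).count() / repeats;
    if (avg_us < best_us) {
      best_us = avg_us;
      best = c;
    }
  }
  return best;
}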
......@@ -22,40 +22,42 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
using kernel_t = void (ConvImageCompute::*)(bool);
void PrepareForRun() override;
void Run() override;
double Turn(int times = 5);
private:
void Conv2d1x1opt();
void Conv2d3x3();
void Conv2d3x3opt();
void Conv2d5x5();
void Conv2d5x5opt();
void Conv2d7x7();
void Conv2d7x7opt();
void DepthwiseConv2d3x3s1();
void DepthwiseConv2d3x3();
void DepthwiseConv2d();
void Conv2d1x1opt(bool is_turn = false);
void Conv2d3x3(bool is_turn = false);
void Conv2d3x3opt(bool is_turn = false);
void Conv2d5x5(bool is_turn = false);
void Conv2d5x5opt(bool is_turn = false);
void Conv2d7x7(bool is_turn = false);
void Conv2d7x7opt(bool is_turn = false);
void DepthwiseConv2d3x3s1(bool is_turn = false);
void DepthwiseConv2d3x3(bool is_turn = false);
void DepthwiseConv2d(bool is_turn = false);
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
......@@ -72,7 +74,8 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
cl::Kernel kernel_;
cl::NDRange local_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
bool use_lws{true};
bool use_lws_{true};
bool use_turn_{false};
};
} // namespace opencl
......
......@@ -44,8 +44,10 @@ class DepthwiseConv2dCompute
build_options_ += " -DRELU6";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/depthwise_conv2d_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/depthwise_conv2d_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -67,7 +69,7 @@ class DepthwiseConv2dCompute
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
......@@ -120,6 +122,7 @@ class DepthwiseConv2dCompute
private:
std::string kernel_func_name_{"depthwise_conv2d"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -40,8 +40,10 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/dropout_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/dropout_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -63,7 +65,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
......@@ -101,6 +103,7 @@ class DropoutComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"dropout"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -25,8 +25,10 @@ namespace opencl {
void ElementwiseAddCompute::PrepareForRun() {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
}
......@@ -39,7 +41,7 @@ void ElementwiseAddCompute::Run() {
auto* out_buf = ele_param_->Out->template mutable_data<float, cl::Buffer>(
TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << TargetToStr(ele_param_->X->target());
......
......@@ -16,6 +16,7 @@
#include <memory>
#include <string>
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
......@@ -46,6 +47,7 @@ class ElementwiseAddCompute
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -23,10 +23,20 @@ namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseAddImageCompute::PrepareForRun() {
void ElementwiseAddImageCompute::PrepareForRun() {}
void ElementwiseAddImageCompute::ReInitWhenNeeded() {
ele_param_ = param_.get_mutable<param_t>();
auto x_dims = ele_param_->X->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// choose kernel
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
if (y->dims().size() == 4) {
......@@ -49,18 +59,48 @@ void ElementwiseAddImageCompute::PrepareForRun() {
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
x_img_shape_ = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
y_img_shape_ = default_convertor.InitImageDimInfoWith(y->dims());
out_img_shape_ =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
// compute global work size
GetGlobalWorkSize();
}
}
void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
void ElementwiseAddImageCompute::GetGlobalWorkSize() {
global_work_size_ = cl::NDRange{static_cast<cl::size_type>(x_img_shape_[0]),
static_cast<cl::size_type>(x_img_shape_[1])};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_shape_[0] << " "
<< x_img_shape_[1];
#endif
}
void ElementwiseAddImageCompute::Run() {
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
auto x_dims = x->dims();
auto y_dims = y->dims();
auto* x_img = x->data<half_t, cl::Image2D>();
auto* y_img = y->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x->target():" << TargetToStr(x->target());
......@@ -70,75 +110,53 @@ void ElementwiseAddImageCompute::Run() {
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
#endif
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<half_t, cl::Image2D>();
auto* y_img = y->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape[0],
out_img_shape[1]);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
VLOG(4) << "x_img_shape_[w,h]:" << x_img_shape_[0] << " " << x_img_shape_[1];
VLOG(4) << "y_img_shape_[w,h]:" << y_img_shape_[0] << " " << y_img_shape_[1];
VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
<< out_img_shape_[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
cl_int status;
auto kernel = kernel_;
if (y_dims.size() == 4) {
cl_int status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
const int tensor_w = x_dims[x_dims.size() - 1];
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "tensor_w:" << tensor_w;
#endif
cl_int status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
status = kernel.setArg(1, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
status = kernel.setArg(3, tensor_w);
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
<< ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y_dims.size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
<< ", x->dims().size():" << x_dims.size()
<< ", y->dims.size():" << y_dims.size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
#endif
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......
......@@ -15,8 +15,10 @@
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
......@@ -34,6 +36,10 @@ class ElementwiseAddImageCompute
void PrepareForRun() override;
void ReInitWhenNeeded() override;
void GetGlobalWorkSize();
void Run() override;
std::string doc() const override {
......@@ -42,8 +48,21 @@ class ElementwiseAddImageCompute
protected:
param_t* ele_param_{nullptr};
DDim last_x_dims_;
DDim x_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim y_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
cl::Kernel kernel_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
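ElementwiseAddImageCompute (and the fc, scale and grid_sampler kernels further down) now split their setup between PrepareForRun and a ReInitWhenNeeded hook: image shapes, the cached cl::Kernel and the global work size are recomputed only when the input dims actually change, guarded by first_epoch_for_reinit_ and last_x_dims_. A stripped-down sketch of that guard follows; MyKernel and RecomputeForShape are placeholders rather than Paddle-Lite types, and the condition is a logically equivalent simplification of the one in the hunk above.

// Sketch of the ReInitWhenNeeded() pattern: cache shape-dependent setup and
// redo it only when the input dims change.
#include <cstdint>
#include <vector>

class MyKernel {
 public:
  void ReInitWhenNeeded(const std::vector<int64_t>& x_dims) {
    if (first_epoch_for_reinit_ || x_dims != last_x_dims_) {
      last_x_dims_ = x_dims;
      first_epoch_for_reinit_ = false;
      RecomputeForShape(x_dims);  // image shapes, kernel lookup, global work size
    }
  }

 private:
  void RecomputeForShape(const std::vector<int64_t>& dims) { (void)dims; }
  bool first_epoch_for_reinit_{true};
  std::vector<int64_t> last_x_dims_;
};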
......@@ -71,8 +71,10 @@ class ElementwiseMulImageCompute
VLOG(4) << "bias_dims.size():" << bias_dims.size();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/elementwise_mul_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -114,7 +116,7 @@ class ElementwiseMulImageCompute
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto bias_dims = y->dims();
......@@ -201,6 +203,7 @@ class ElementwiseMulImageCompute
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -49,8 +49,10 @@ void ElementwiseSubImageCompute::PrepareForRun() {
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_sub_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/elementwise_sub_kernel.cl",
build_options_,
time_stamp_);
}
void ElementwiseSubImageCompute::Run() {
......@@ -93,7 +95,7 @@ void ElementwiseSubImageCompute::Run() {
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
......
......@@ -17,6 +17,7 @@
#include <string>
#include "lite/backends/opencl/cl_half.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
......@@ -44,6 +45,7 @@ class ElementwiseSubImageCompute
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_sub"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
......@@ -30,74 +31,98 @@ class FcCompute
public:
using param_t = operators::FcParam;
void PrepareForRun() override {
const auto& param = *param_.get_mutable<param_t>();
const auto x_dims = param.input->dims();
const auto w_dims = param.w->dims();
void PrepareForRun() override {}
void ReInitWhenNeeded() override {
fc_param_ = param_.get_mutable<param_t>();
const auto x_dims = fc_param_->input->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute m,n,k
const auto w_dims = fc_param_->w->dims();
CHECK_GE(x_dims.size(), 2UL);
CHECK_GE(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
CHECK_EQ(fc_param_->output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
m_ = x_dims.Slice(0, fc_param_->in_num_col_dims).production();
k_ = x_dims.Slice(fc_param_->in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(4) << "w_dims:" << w_dims[0] << " " << w_dims[1] << " " << w_dims[2]
<< " " << w_dims[3];
VLOG(4) << "m_: " << m_ << " n_: " << n_ << " k_: " << k_;
#endif
// choose kernel
if (m_ == 1) { // gemv
kernel_func_name_ = "fc_gemv_1x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
kernel_func_name_ = "fc_gemm_4x4";
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
}
#ifndef LITE_SHUTDOWN_LOG
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
#endif
if (param.activation_type == "relu") {
if (fc_param_->activation_type == "relu") {
build_options_ += "-DRELU";
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/fc_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/fc_kernel.cl",
build_options_,
time_stamp_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
if (m_ == 1) { // gemv
global_work_size_ = cl::NDRange{static_cast<size_t>((n_ + 3) / 4)};
} else { // gemm
global_work_size_ = cl::NDRange{static_cast<size_t>((m_ + 3) / 4),
static_cast<size_t>((n_ + 3) / 4)};
}
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x_buf = param.input->data<float, cl::Buffer>();
auto* w_buf = param.w->data<float, cl::Buffer>();
auto* bias_buf = param.bias->data<float, cl::Buffer>();
auto* x_buf = fc_param_->input->data<float, cl::Buffer>();
auto* w_buf = fc_param_->w->data<float, cl::Buffer>();
auto* bias_buf = fc_param_->bias->data<float, cl::Buffer>();
auto* out_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
fc_param_->output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_buf);
status = kernel.setArg(0, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *w_buf);
status = kernel.setArg(1, *w_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *bias_buf);
status = kernel.setArg(2, *bias_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
status = kernel.setArg(3, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(m_));
status = kernel.setArg(4, static_cast<const int>(m_));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(n_));
status = kernel.setArg(5, static_cast<const int>(n_));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(k_));
status = kernel.setArg(6, static_cast<const int>(k_));
CL_CHECK_FATAL(status);
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
......@@ -111,9 +136,14 @@ class FcCompute
private:
int m_, n_, k_;
param_t* fc_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_float "};
std::string time_stamp_{GetTimeStamp()};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
cl::NDRange global_work_size_;
cl::Kernel kernel_;
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -28,8 +28,10 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
void PrepareForRun() override {
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/elementwise_add_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>();
UpdateParams();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
......
......@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/elementwise_add_image_compute.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
......@@ -30,8 +31,10 @@ class FusionElementwiseAddActivationImageCompute
void PrepareForRun() override {
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/elementwise_add_kernel.cl",
build_options_,
time_stamp_);
ele_param_ = param_.get_mutable<param_t>();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t;
......
......@@ -39,96 +39,120 @@ class GridSamplerImageCompute : public KernelLite<TARGET(kOpenCL),
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/grid_sampler_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
}
void ReInitWhenNeeded() override {
grid_param_ = param_.get_mutable<param_t>();
auto x_dims = grid_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(grid_param_->out->dims());
// compute global work size
GetGlobalWorkSize();
}
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/grid_sampler_kernel.cl", build_options_);
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
void GetGlobalWorkSize() {
auto default_work_size =
DefaultWorkSize(grid_param_->out->dims(),
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_img_shape_[0]),
static_cast<int64_t>(out_img_shape_[1])}));
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
VLOG(4) << "global_work_size_:[2D]:" << global_work_size_[0] << " "
<< global_work_size_[1] << " " << global_work_size_[2];
#endif
}
void Run() override {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = grid_param_->x;
auto* out = grid_param_->out;
auto* grid = grid_param_->grid;
auto* out = grid_param_->out;
auto out_dims = out->dims();
auto in_dims = x->dims();
int out_height = out_dims[2];
int out_width = out_dims[3];
auto* x_img = x->data<half_t, cl::Image2D>();
auto* grid_img = x->data<half_t, cl::Image2D>();
auto* out_img = out->mutable_data<half_t, cl::Image2D>(out_img_shape_[0],
out_img_shape_[1]);
#ifndef LITE_SHUTDOWN_LOG
auto in_dims = x->dims();
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << in_dims;
VLOG(4) << "out->dims():" << out_dims;
#endif
auto out_image_shape = InitImageDimInfoWith(out_dims);
auto* x_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "x_image: " << x_img;
auto* grid_img = x->data<half_t, cl::Image2D>();
// VLOG(4) << "grid_img: " << grid_img;
auto* out_img = out->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
#ifndef LITE_SHUTDOWN_LOG
// VLOG(4) << "out_image" << out_img;
VLOG(4) << "out_image_shape[w,h]:" << out_image_shape["width"] << " "
<< out_image_shape["height"];
VLOG(4) << "out_img_shape_[w,h]:" << out_img_shape_[0] << " "
<< out_img_shape_[1];
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
int out_height = out_dims[2];
int out_width = out_dims[3];
auto default_work_size =
DefaultWorkSize(out_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
<< default_work_size[1] << ", " << default_work_size[2];
#endif
cl_int status = kernel.setArg(arg_idx++, *x_img);
cl_int status;
auto kernel = kernel_;
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *grid_img);
status = kernel.setArg(1, *grid_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, *out_img);
status = kernel.setArg(2, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_height);
status = kernel.setArg(3, out_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(arg_idx++, out_width);
status = kernel.setArg(4, out_width);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
static_cast<cl::size_type>(default_work_size[1]),
static_cast<cl::size_type>(default_work_size[2] / 4)};
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "global_work_size:[2D]:" << global_work_size[0] << " "
<< global_work_size[1] << " " << global_work_size[2];
#endif
}
protected:
param_t* grid_param_{nullptr};
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
std::string kernel_func_name_{"grid_sampler"};
cl::Kernel kernel_;
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -74,6 +74,12 @@ static std::vector<size_t> DefaultWorkSize(const DDim& image_dim,
LOG(FATAL) << " not support this dim, need imp ";
}
static const std::string GetTimeStamp() {
struct timeval time;
gettimeofday(&time, NULL);
return std::to_string(time.tv_usec);
}
} // namespace opencl
} // namespace kernels
} // namespace lite
......
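The GetTimeStamp() helper above is what the time_stamp_ members scattered through this diff build on: each kernel instance captures tv_usec once at construction and appends it both when calling AddKernel and when composing the GetKernel lookup key, so two instances that share the same kernel_func_name_ and build_options_ no longer collide in the CLContext kernel cache (the suffix only distinguishes instances created in different microseconds). Below is a self-contained sketch of the key composition; the std::map stands in for the context's kernel cache and the int value for a compiled cl::Kernel.

// Sketch only: map = stand-in for the CLContext kernel cache.
#include <sys/time.h>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

static std::string GetTimeStamp() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return std::to_string(time.tv_usec);
}

int main() {
  std::map<std::string, int> kernel_cache;
  const std::string kernel_func_name = "elementwise_add";
  const std::string build_options = "-DCL_DTYPE_half";
  const std::string time_stamp = GetTimeStamp();  // fixed per kernel instance

  std::ostringstream key;  // same composition as AddKernel()/GetKernel()
  key << kernel_func_name << build_options << time_stamp;
  kernel_cache[key.str()] = 42;

  std::cout << "cached under: " << key.str() << "\n";
  return 0;
}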
......@@ -60,8 +60,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/instance_norm_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/instance_norm_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -115,7 +117,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
out_image_shape["width"], out_image_shape["height"]);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status = kernel.setArg(0, out_w);
......@@ -180,8 +182,10 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
bias_image_.mutable_data<half_t, cl::Image2D>(
scale_img_size[0], scale_img_size[1], bias_img.data());
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/instance_norm_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/instance_norm_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -234,7 +238,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto* scale_img = scale_image_.data<half_t, cl::Image2D>();
auto* bias_img = bias_image_.data<half_t, cl::Image2D>();
......@@ -271,6 +275,7 @@ class InstanceNormImageCompute : public KernelLite<TARGET(kOpenCL),
param_t* instance_norm_param_{nullptr};
std::string kernel_func_name_{"instance_norm_onnx"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor scale_image_;
Tensor bias_image_;
......
......@@ -48,7 +48,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
beta_ = lrn_param_->beta;
norm_region_ = lrn_param_->norm_region;
context.cl_context()->AddKernel(
kernel_func_name_, "image/lrn_kernel.cl", build_options_);
kernel_func_name_, "image/lrn_kernel.cl", build_options_, time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -91,7 +91,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
......@@ -152,6 +152,7 @@ class LrnImageCompute : public KernelLite<TARGET(kOpenCL),
std::string norm_region_{"AcrossChannels"};
std::string kernel_func_name_{"lrn"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
......@@ -32,8 +33,10 @@ class MulCompute
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/mat_mul_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/mat_mul_kernel.cl",
build_options_,
time_stamp_);
const auto& param = *param_.get_mutable<param_t>();
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
......@@ -68,7 +71,7 @@ class MulCompute
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
......@@ -103,6 +106,7 @@ class MulCompute
int m_, n_, k_;
std::string kernel_func_name_{"mat_mul"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -38,8 +38,10 @@ class NearestInterpComputeImageDefault
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/nearest_interp_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -66,7 +68,7 @@ class NearestInterpComputeImageDefault
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
......@@ -121,6 +123,7 @@ class NearestInterpComputeImageDefault
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{" -DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -52,8 +52,10 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
}
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/pad2d_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/pad2d_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
}
......@@ -93,7 +95,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
#endif
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
......@@ -159,6 +161,7 @@ class Pad2dCompute : public KernelLite<TARGET(kOpenCL),
param_t* pad2d_param_{nullptr};
std::string kernel_func_name_{};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -37,8 +37,10 @@ class PoolCompute
const auto& param = *param_.get_mutable<param_t>();
kernel_func_name_ += param.pooling_type;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "buffer/pool_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"buffer/pool_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -69,7 +71,7 @@ class PoolCompute
auto* output_buf =
param.output->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
auto numel = out_dims.production();
......@@ -117,6 +119,7 @@ class PoolCompute
private:
std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_float"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -47,7 +47,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/pool_kernel.cl", build_options_);
kernel_func_name_, "image/pool_kernel.cl", build_options_, time_stamp_);
}
void Run() override {
......@@ -112,7 +112,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
// VLOG(4) << "out_image" << out_img;
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int c_block = (out_dims[1] + 3) / 4;
......@@ -164,6 +164,7 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"pool_"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -36,8 +36,10 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/reshape_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/reshape_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -110,7 +112,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
#ifndef LITE_SHUTDOWN_LOG
......@@ -166,6 +168,7 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"reshape"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -37,53 +37,66 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(kernel_func_name_,
"image/scale_kernel.cl",
build_options_,
time_stamp_);
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/scale_kernel.cl", build_options_);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
}
void ReInitWhenNeeded() override {
scale_param_ = param_.get_mutable<param_t>();
auto x_dims = scale_param_->x->dims();
if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) ||
first_epoch_for_reinit_) {
last_x_dims_ = x_dims;
first_epoch_for_reinit_ = false;
// compute image shape
paddle::lite::CLImageConverterDefault default_convertor;
out_img_shape_ =
default_convertor.InitImageDimInfoWith(scale_param_->output->dims());
// compute global work size
GetGlobalWorkSize();
}
}
void GetGlobalWorkSize() {
global_work_size_ =
cl::NDRange{static_cast<cl::size_type>(out_img_shape_[0]),
static_cast<cl::size_type>(out_img_shape_[1])};
}
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
auto* x_img = param.x->data<half_t, cl::Image2D>();
const float scale = param.scale;
const float bias = param.bias;
// LOG(INFO) << "x_image" << x_img;
auto out_image_shape = InitImageDimInfoWith(in_dims);
#ifndef LITE_SHUTDOWN_LOG
VLOG(4) << "out_image_shape = " << out_image_shape["width"] << " "
<< out_image_shape["height"];
#endif
auto* out_img = param.output->mutable_data<half_t, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
// LOG(INFO) << "out_image" << out_img;
auto* x_img = scale_param_->x->data<half_t, cl::Image2D>();
auto* out_img = scale_param_->output->mutable_data<half_t, cl::Image2D>(
out_img_shape_[0], out_img_shape_[1]);
const float scale = scale_param_->scale;
const float bias = scale_param_->bias;
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(out_image_shape["width"]),
static_cast<cl::size_type>(out_image_shape["height"])};
auto kernel = kernel_;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, *x_img);
status = kernel.setArg(0, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
status = kernel.setArg(1, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, scale);
status = kernel.setArg(2, scale);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, bias);
status = kernel.setArg(3, bias);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
global_work_size_,
cl::NullRange,
nullptr,
event_.get());
......@@ -94,7 +107,17 @@ class ScaleComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"scale"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
param_t* scale_param_{nullptr};
cl::Kernel kernel_;
bool first_epoch_for_reinit_{true};
DDim last_x_dims_;
DDim out_img_shape_ = DDim(std::vector<DDim::value_type>(
{static_cast<DDim::value_type>(1), static_cast<DDim::value_type>(1)}));
cl::NDRange global_work_size_ = cl::NDRange{
static_cast<size_t>(1), static_cast<size_t>(1), static_cast<size_t>(1)};
};
} // namespace opencl
......
......@@ -38,8 +38,10 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
context.cl_context()->AddKernel(
kernel_func_name_, "image/slice_kernel.cl", build_options_);
context.cl_context()->AddKernel(kernel_func_name_,
"image/slice_kernel.cl",
build_options_,
time_stamp_);
}
void Run() override {
......@@ -68,7 +70,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
cl_int status;
......@@ -108,6 +110,7 @@ class SliceComputeImage2D : public KernelLite<TARGET(kOpenCL),
private:
std::string kernel_func_name_{"slice"};
std::string build_options_{"-DCL_DTYPE_half"};
std::string time_stamp_{GetTimeStamp()};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
......
......@@ -25,6 +25,7 @@ lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${x
lite_cc_library(subgraph_bridge_dropout_op_xpu SRCS dropout_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_xpu SRCS matmul_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_cast_op_xpu SRCS cast_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_yolo_box_op_xpu SRCS yolo_box_op.cc DEPS ${xpu_subgraph_bridge_deps})
set(xpu_subgraph_bridges
subgraph_bridge_registry
......@@ -48,6 +49,7 @@ set(xpu_subgraph_bridges
subgraph_bridge_dropout_op_xpu
subgraph_bridge_matmul_op_xpu
subgraph_bridge_cast_op_xpu
subgraph_bridge_yolo_box_op_xpu
CACHE INTERNAL "xpu_subgraph_bridges")
message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
......@@ -37,3 +37,4 @@ USE_SUBGRAPH_BRIDGE(gelu, kXPU);
USE_SUBGRAPH_BRIDGE(dropout, kXPU);
USE_SUBGRAPH_BRIDGE(matmul, kXPU);
USE_SUBGRAPH_BRIDGE(cast, kXPU);
USE_SUBGRAPH_BRIDGE(yolo_box, kXPU);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int YoloBoxConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto img_size_name = op_info->Input("ImgSize").front();
auto img_size = scope->FindTensor(img_size_name);
auto boxes_name = op_info->Output("Boxes").front();
auto scores_name = op_info->Output("Scores").front();
auto anchors = op_info->GetAttr<std::vector<int>>("anchors");
auto class_num = op_info->GetAttr<int>("class_num");
auto conf_thresh = op_info->GetAttr<float>("conf_thresh");
auto downsample_ratio = op_info->GetAttr<int>("downsample_ratio");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// ImgSize node
std::shared_ptr<Node> img_size_node = nullptr;
if (graph->Has(img_size_name)) {
img_size_node = graph->Get(img_size_name);
} else {
img_size_node = graph->Add(img_size_name, *img_size);
}
// YoloBox node
auto yolo_box_data =
graph->builder_.CreateYoloBox(*x_node->data(),
*img_size_node->data(),
CvtShape<xtcl::Integer>(anchors),
class_num,
conf_thresh,
downsample_ratio);
graph->Add(boxes_name, graph->builder_.GetField(yolo_box_data, 0));
graph->Add(scores_name, graph->builder_.GetField(yolo_box_data, 1));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(yolo_box,
kXPU,
paddle::lite::subgraph::xpu::YoloBoxConverter);
......@@ -34,7 +34,7 @@ int SubgraphEngine::BuildDeviceProgram() {
subgraph::xpu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = inst.op();
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
......@@ -43,10 +43,8 @@ int SubgraphEngine::BuildDeviceProgram() {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |=
bridges.Select(op_type, TARGET(kXPU))(reinterpret_cast<void*>(&graph),
const_cast<OpLite*>(op),
const_cast<KernelBase*>(kernel));
status |= bridges.Select(op_type, TARGET(kXPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
......
......@@ -382,7 +382,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) {
pb_dims->Resize(static_cast<int>(dims.size()), 0);
auto dims_vec = dims.Vectorize();
std::copy(dims_vec.begin(), dims_vec.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
int32_t size = desc.ByteSizeLong();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
......
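The TensorToStream change swaps protobuf's deprecated ByteSize() (which returns int) for ByteSizeLong() (which returns size_t); the stream format itself still stores a 32-bit length prefix, so the value is narrowed to int32_t when written. A small standalone sketch of that size-prefix pattern follows, using a plain string in place of the protobuf descriptor and adding an explicit guard on the narrowing; it is an assumption-laden illustration, not the Paddle-Lite serializer.

// Sketch only: "payload" stands in for desc.SerializeAsString() and its size
// for desc.ByteSizeLong(); the guard keeps the 32-bit length prefix honest.
#include <cstdint>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>

int main() {
  std::ostringstream os;
  const std::string payload = "serialized-tensor-desc";
  const size_t byte_size = payload.size();
  if (byte_size > static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
    return 1;  // too large for a 32-bit length prefix
  }
  const int32_t size = static_cast<int32_t>(byte_size);
  os.write(reinterpret_cast<const char*>(&size), sizeof(size));
  os.write(payload.data(), size);
  std::cout << "wrote " << os.str().size() << " bytes\n";
  return 0;
}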
......@@ -141,13 +141,12 @@ add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS})
# 4. training op
add_operator(mean_op extra SRCS mean_op.cc DEPS ${op_DEPS})
if (LITE_WITH_TRAIN)
add_operator(mean_grad_op extra SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops basic SRCS activation_grad_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_grad_op extra SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
add_operator(mul_grad_op basic SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op extra SRCS sgd_op.cc DEPS ${op_DEPS})
endif()
add_operator(mean_grad_op train SRCS mean_grad_op.cc DEPS ${op_DEPS})
add_operator(activation_grad_ops train SRCS activation_grad_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_grad_op train SRCS elementwise_grad_ops.cc DEPS ${op_DEPS})
add_operator(mul_grad_op train SRCS mul_grad_op.cc DEPS ${op_DEPS})
add_operator(sgd_op train SRCS sgd_op.cc DEPS ${op_DEPS})
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
......
......@@ -71,6 +71,9 @@ bool ActivationOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
} else if (opdesc.Type() == "exp") {
// exp
param_.active_type = lite_api::ActivationType::kExp;
} else if (opdesc.Type() == "abs") {
// abs
param_.active_type = lite_api::ActivationType::kAbs;
}
VLOG(4) << "opdesc.Type():" << opdesc.Type();
......@@ -92,6 +95,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(abs, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp);
REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp);
......