Commit 9d70dd4d authored by jiweibo

Merge branch 'develop' into add_matmul_op

......@@ -3,7 +3,8 @@ repos:
sha: v1.0.1
hooks:
- id: remove-crlf
files: (?!.*third_party)^.*$ | (?!.*book)^.*$ ^mobile/ ^metal/ ^web/
files: (?!.*third_party)^.*$|(?!.*book)^.*$
exclude: ^(mobile/|metal/|web/)
#- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
#sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
#hooks:
......@@ -16,7 +17,7 @@ repos:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*third_party)^.*$ | (?!.*book)^.*$
files: (?!.*third_party)^.*$|(?!.*book)^.*$
- id: end-of-file-fixer
- repo: local
hooks:
......@@ -25,7 +26,8 @@ repos:
description: Format files with ClangFormat.
entry: bash ./tools/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ ^mobile/ ^metal/ ^web/
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
exclude: ^(mobile/|metal/|web/)
- repo: local
hooks:
- id: cpplint-cpp-source
......@@ -33,7 +35,8 @@ repos:
description: Check C++ code style using cpplint.py.
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ ^mobile/ ^metal/ ^web/
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
exclude: ^(mobile/|metal/|web/)
#- repo: local
#hooks:
#- id: pylint-doc-string
......@@ -48,5 +51,6 @@ repos:
name: copyright_checker
entry: python ./tools/codestyle/copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ ^mobile/ ^metal/ ^web/
exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|^(mobile/|metal/|web/)
......@@ -9,18 +9,17 @@ os:
addons:
apt:
packages:
- git
- python
- python-pip
- python2.7-dev
- libc6-i386
- curl
compiler:
- clang
# - git
# - python
# - python-pip
# - python2.7-dev
# - libc6-i386
# - curl
- clang-format-3.8
before_install:
- sudo pip install -U virtualenv pre-commit pip
- sudo pip install cpplint pre-commit
- sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format
# Download and install recent cmake
script:
......
......@@ -11,6 +11,8 @@ cd `dirname $0`
cd ..
export PATH=/usr/bin:$PATH
pre-commit install
which clang-format
clang-format --version
if ! pre-commit run -a ; then
ls -lh
......
......@@ -80,6 +80,8 @@ option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
# publish options
option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......@@ -93,7 +95,7 @@ endif()
# check options
if (LITE_ON_TINY_PUBLISH)
if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_JAVA AND NOT WITH_TESTING))
if (NOT (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND NOT WITH_TESTING)) # LITE_WITH_JAVA AND
message(FATAL_ERROR "LITE_ON_TINY_PUBLISH=ON must be used with WITH_LITE=ON LITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON WITH_TESTING=OFF")
return()
endif()
......
......@@ -127,6 +127,7 @@ elseif(ARM_TARGET_OS STREQUAL "ios64")
else()
return()
endif()
add_definitions(-DTARGET_IOS)
# if do not specify the ARM_TARGET_ARCH_ABI then use default all supported
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7"
......
......@@ -57,6 +57,8 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
foreach(var ${lite_deps_HVY_DEPS})
set(deps ${deps} ${var})
......@@ -182,9 +184,16 @@ function(lite_cc_test TARGET)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
ARGS
COMPILE_LEVEL # (basic|extra)
)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if (args_COMPILE_LEVEL STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
MESSAGE(STATUS "Ignore test ${TARGET} due to compile level ${args_COMPILE_LEVEL}")
return()
endif()
set(deps "")
lite_deps(deps
DEPS ${args_DEPS}
......@@ -207,6 +216,117 @@ function(lite_cc_test TARGET)
endif()
endfunction()
set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
# add a kernel for a specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${device}" STREQUAL "Host")
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
if (NOT LITE_WITH_ARM)
return()
endif()
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "NPU")
if (NOT LITE_WITH_NPU)
return()
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA)
return()
endif()
set(fpga_kernels "${fpga_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
return()
endif()
set(opencl_kernels "${opencl_kernels};${TARGET}" CACHE INTERNAL "")
endif()
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
)
endfunction()
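For readers of this function: each source file passed to add_kernel is appended to kernels_src_list.txt, which the header-generation script scans for registration macros. A minimal, hedged sketch of what such a kernel source is expected to contain (the op name, kernel class, and bindings here are hypothetical, not part of this commit):

// Hypothetical kernel registration carried by an add_kernel-listed source;
// parse_kernel_registry.py turns these into USE_LITE_KERNEL lines.
REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW,
                     paddle::lite::kernels::arm::ReluCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();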
set(ops CACHE INTERNAL "ops")
set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt")
file(WRITE ${ops_src_list} "") # clean
# add an operator
# level: one of (basic, extra)
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
foreach(src ${args_SRCS})
file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
)
endfunction()
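Likewise, each add_operator source is appended to ops_src_list.txt and is expected to carry an operator registration that the paddle_use_ops.h generator picks up; a hedged sketch (the op and class names are hypothetical):

// Hypothetical operator registration scanned by parse_op_registry.py.
REGISTER_LITE_OP(matmul, paddle::lite::operators::MatMulOpLite);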
# Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target)
......
......@@ -32,7 +32,11 @@ ELSE(WIN32)
SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
ENDIF()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
IF(ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"
OR ARM_TARGET_OS STREQUAL "ios" OR ARM_TARGET_OS STREQUAL "ios64")
ELSE()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
ENDIF()
ELSE(APPLE)
IF(EXISTS "/etc/issue")
......
......@@ -13,7 +13,6 @@ set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
add_subdirectory(utils)
add_subdirectory(operators)
add_subdirectory(kernels)
......@@ -78,14 +77,16 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp "${CMAKE_BINARY_DIR}/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
)
add_dependencies(publish_inference_cxx_lib model_optimize_tool)
add_dependencies(publish_inference_cxx_lib paddle_code_generator)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
if(NOT IOS)
add_dependencies(publish_inference_cxx_lib model_optimize_tool)
add_dependencies(publish_inference_cxx_lib paddle_code_generator)
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib test_model_bin)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
endif()
endif()
......
......@@ -17,6 +17,7 @@ if(LITE_WITH_FPGA)
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
......@@ -117,7 +118,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
add_dependencies(test_mobilenetv1 extern_lite_download_mobilenet_v1_tar_gz)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......@@ -125,7 +126,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
--model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu SERIAL)
add_dependencies(test_mobilenetv2 extern_lite_download_mobilenet_v2_relu_tar_gz)
set_target_properties(test_mobilenetv2 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
lite_cc_test(test_resnet50 SRCS resnet50_test.cc
DEPS ${lite_model_test_DEPS}
CL_DEPS ${opencl_kernels}
......@@ -145,8 +146,13 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
# lite_cc_test(model_run_test_image SRCS model_run_test_image.cc
# DEPS ${lite_model_test_DEPS}
# CL_DEPS ${opencl_kernels}
# FPGA_DEPS ${fpga_kernels})
endif()
# These tests needs CLI arguments, and is not supported in ARM CI.
......@@ -169,7 +175,11 @@ lite_cc_library(paddle_api SRCS paddle_api.cc DEPS op_params tensor)
#-----------------------------------------------------------------------------------------------------
# The final inference library for both CxxConfig and MobileConfig.
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api)
if (LITE_ON_TINY_PUBLISH)
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api stream)
else()
lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api paddle_api)
endif()
if (NOT LITE_ON_TINY_PUBLISH)
lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api paddle_api light_api
${ops}
......
......@@ -21,6 +21,8 @@
#ifndef LITE_WITH_FPGA
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
#else
USE_LITE_KERNEL(feed, kFPGA, kFP16, kNHWC, def);
USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
......
......@@ -73,9 +73,12 @@ USE_LITE_OP(prior_box)
USE_LITE_OP(density_prior_box)
USE_LITE_OP(reshape)
USE_LITE_OP(reshape2)
USE_LITE_OP(flatten)
USE_LITE_OP(flatten2)
USE_LITE_OP(split)
USE_LITE_OP(fake_quantize_moving_average_abs_max);
USE_LITE_OP(fake_dequantize_max_abs);
USE_LITE_OP(fake_quantize_range_abs_max);
USE_LITE_OP(calib);
USE_LITE_OP(calib_once);
USE_LITE_OP(norm);
......
......@@ -20,7 +20,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
else()
add_library(paddle_lite_jni SHARED "")
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
#add_dependencies(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
endif()
if (APPLE)
......
......@@ -30,6 +30,9 @@ DEFINE_string(input_shape,
"1,3,224,224",
"input shapes, separated by colon and comma");
DEFINE_string(result_filename, "", "save test result");
DEFINE_bool(run_model_optimize,
false,
"apply model_optimize_tool to the model and test with the optimized model");
namespace paddle {
namespace lite_api {
......@@ -69,10 +72,10 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
if (thread_num == 1) {
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
lite::DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
LOG(INFO) << "LITE_POWER_HIGH";
} else {
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_NO_BIND, thread_num);
lite::DeviceInfo::Global().SetRunMode(LITE_POWER_NO_BIND, thread_num);
LOG(INFO) << "LITE_POWER_NO_BIND";
}
#endif
......@@ -172,13 +175,17 @@ int main(int argc, char** argv) {
}
// Output optimized model
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(
FLAGS_model_dir, save_optimized_model_dir, input_shapes);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shapes,
save_optimized_model_dir,
run_model_dir,
FLAGS_repeats,
FLAGS_threads,
FLAGS_warmup,
......
......@@ -71,6 +71,13 @@ const lite::Tensor *Predictor::GetOutput(size_t offset) const {
return &fetch_list.at(offset);
}
const std::vector<lite::Tensor> *Predictor::GetOutputs() const {
auto *_fetch_list = exec_scope_->FindVar("fetch");
CHECK(_fetch_list) << "no fetch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
return &fetch_list;
}
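A minimal usage sketch for the new GetOutputs() accessor, assuming a built predictor (the model path, preferred_place, and valid_places below are placeholders):

// Hedged sketch: fetch all outputs at once instead of per-offset GetOutput().
lite::Predictor predictor;
predictor.Build("./mobilenet_v1", preferred_place, valid_places);  // hypothetical path
predictor.Run();
const std::vector<lite::Tensor>* outputs = predictor.GetOutputs();
for (const auto& t : *outputs) {
  LOG(INFO) << "output dims: " << t.dims();
}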
const cpp::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
......
......@@ -69,6 +69,7 @@ class LITE_API Predictor {
// Get offset-th col of fetch results.
const lite::Tensor* GetOutput(size_t offset) const;
const std::vector<lite::Tensor>* GetOutputs() const;
const cpp::ProgramDesc& program_desc() const;
const lite::Tensor* GetTensor(const std::string& name) const;
......
......@@ -28,7 +28,7 @@ namespace lite {
void TestModel(const std::vector<Place> &valid_places,
const Place &preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -40,6 +40,10 @@ class LightPredictorImpl : public PaddlePredictor {
void LightPredictorImpl::Init(const MobileConfig& config) {
// LightPredictor only supports the NaiveBuffer backend in the published lib
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
lite::DeviceInfo::Global().SetRunMode(config.power_mode(), config.threads());
#endif
raw_predictor_.reset(new lite::LightPredictor(config.model_dir(),
LiteModelType::kNaiveBuffer));
}
......
......@@ -29,7 +29,7 @@ void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -33,7 +33,7 @@ void TestModel(const std::vector<Place>& valid_places,
bool gen_npu = false,
bool save_model = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(model_dir, preferred_place, valid_places);
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -34,7 +34,7 @@ void TestModel(const std::vector<Place>& valid_places,
bool gen_npu = false,
bool save_model = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(model_dir, preferred_place, valid_places);
......
......@@ -33,7 +33,7 @@ DEFINE_string(valid_targets,
"arm",
"The targets this model optimized for, should be one of (arm, "
"opencl, x86), splitted by space");
DEFINE_bool(int8_mode, false, "Support Int8 quantitative mode");
DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels");
namespace paddle {
namespace lite_api {
......@@ -62,7 +62,7 @@ void Main() {
CHECK(!valid_places.empty())
<< "At least one target should be set, should set the "
"command argument 'valid_targets'";
if (FLAGS_int8_mode) {
if (FLAGS_prefer_int8_kernel) {
LOG(WARNING) << "Int8 mode is only support by ARM target";
valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)});
config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)});
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
TEST(model, test) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kInt8)}});
auto precision = PRECISION(kFloat);
if (FLAGS_int8) {
precision = PRECISION(kInt8);
}
predictor.Build(
FLAGS_model_dir, Place{TARGET(kARM), precision}, valid_places);
int im_width = FLAGS_im_width;
int im_height = FLAGS_im_height;
auto* input_tensor = predictor.GetInput(0);
auto in_dims = input_tensor->dims();
input_tensor->Resize(
DDim(std::vector<DDim::value_type>({1, 3, im_width, im_height})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
auto* output_tensors = predictor.GetOutputs();
LOG(INFO) << "======output:========";
for (const auto& t : *output_tensors) {
LOG(INFO) << t;
}
LOG(INFO)
<< "=====RUN_finished!!============= Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
#endif
}
} // namespace lite
} // namespace paddle
......@@ -64,7 +64,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
const int warmup_times = 0) {
#ifdef LITE_WITH_ARM
lite::DeviceInfo::Init();
lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
lite::DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, thread_num);
#endif
lite_api::MobileConfig config;
config.set_model_dir(model_dir);
......
......@@ -29,7 +29,7 @@ void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place,
bool use_npu = false) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -110,7 +110,18 @@ class LITE_API CxxConfig : public ConfigBase {
/// MobileConfig is the config for the lightweight predictor; it skips
/// IR optimization and other unnecessary stages.
class LITE_API MobileConfig : public ConfigBase {};
class LITE_API MobileConfig : public ConfigBase {
PowerMode mode_{LITE_POWER_HIGH};
int threads_{1};
public:
MobileConfig(Place preferred_place = Place(TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW)),
             PowerMode mode = LITE_POWER_HIGH,
             int threads = 1)
    : mode_(mode), threads_(threads) {}
void set_power_mode(PowerMode mode) { mode_ = mode; }
void set_threads(int threads) { threads_ = threads; }
PowerMode power_mode() const { return mode_; }
int threads() const { return threads_; }
};
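A short usage sketch, assuming only the accessors shown above (the model directory is a placeholder):

// Hedged sketch: configure power mode and threads via the extended MobileConfig.
lite_api::MobileConfig config;
config.set_model_dir("./mobilenet_v1_opt");        // hypothetical path
config.set_power_mode(lite_api::LITE_POWER_HIGH);  // prefer big cores
config.set_threads(2);
auto predictor = lite_api::CreatePaddlePredictor<lite_api::MobileConfig>(config);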
template <typename ConfigT>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
......
......@@ -70,6 +70,14 @@ enum class DataLayoutType : int {
kAny = 2, // any data layout
NUM = 4, // number of fields.
};
typedef enum {
LITE_POWER_HIGH = 0,
LITE_POWER_LOW = 1,
LITE_POWER_FULL = 2,
LITE_POWER_NO_BIND = 3,
LITE_POWER_RAND_HIGH = 4,
LITE_POWER_RAND_LOW = 5
} PowerMode;
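Moving PowerMode into the public lite_api header (the old copy is deleted from device_info.h further down) is why call sites elsewhere in this diff now qualify the enum, e.g.:

DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_NO_BIND, 1);  // single thread, no core binding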
enum class ActivationType : int {
kIndentity = 0,
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet18, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -29,7 +29,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -28,7 +28,7 @@ namespace lite {
void TestModel(const std::vector<Place>& valid_places,
const Place& preferred_place) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
predictor.Build(FLAGS_model_dir, preferred_place, valid_places);
......
......@@ -23,6 +23,9 @@ DEFINE_string(model_dir, "", "model dir");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(threads, 1, "threads num");
DEFINE_int32(im_width, 224, "image width");
DEFINE_int32(im_height, 224, "image height");
DEFINE_bool(int8, false, "is run int8");
namespace paddle {
namespace lite {
......
......@@ -28,7 +28,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(unet, test) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, FLAGS_threads);
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -65,7 +65,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
conv_direct_3x3s1.cc
conv_direct_3x3s2.cc
conv_direct.cc
conv_depthwise_3x3_int7.cc
conv_depthwise_3x3_int8.cc
conv_depthwise_5x5s1_int8.cc
conv_depthwise_3x3p0.cc
......
This diff is collapsed.
......@@ -51,7 +51,7 @@ void density_prior_box(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......@@ -82,14 +82,12 @@ void density_prior_box(const lite::Tensor* input,
img_width = image->dims()[3];
img_height = image->dims()[2];
}
float step_w = step_w_;
float step_h = step_h_;
if (step_w == 0 || step_h == 0) {
step_w = static_cast<float>(img_width) / width;
step_h = static_cast<float>(img_height) / height;
}
float offset = offset_;
int step_average = static_cast<int>((step_w + step_h) * 0.5); // add
int channel_size = height * width * prior_num_ * 4;
......@@ -343,7 +341,7 @@ void prior_box(const lite::Tensor* input,
min_size,
std::vector<float>(),
std::vector<float>(),
std::vector<float>(),
std::vector<int>(),
max_size,
aspect_ratio,
variance,
......
......@@ -30,7 +30,7 @@ void density_prior_box(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......
......@@ -37,9 +37,36 @@ lite_cc_library(context SRCS context.cc DEPS tensor any cpu_info CL_DEPS cl_cont
else()
lite_cc_library(context SRCS context.cc DEPS tensor any cpu_info eigen3 CL_DEPS cl_context gflags)
endif()
lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor)
#----------------------------------------------- NOT CHANGE -----------------------------------------------
# A trick to generate the paddle_use_kernels.h
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
${kernels_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
)
# A trick to generate the paddle_use_ops.h
add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
${ops_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
OUTPUT ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
)
add_custom_target(op_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h)
add_custom_target(kernel_list_h DEPENDS ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h)
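For context: these custom targets rebuild the aggregate headers from the source lists that add_kernel/add_operator append to, and the generated files consist of one use-macro per registered op or kernel. A sketch of what paddle_use_ops.h is expected to look like (op names taken from elsewhere in this diff):

// paddle_use_ops.h (generated; do not edit by hand)
USE_LITE_OP(reshape);
USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);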
#----------------------------------------------- NOT CHANGE -----------------------------------------------
lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor
)
lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel
cpp_op_desc tensor)
cpp_op_desc tensor
)
add_dependencies(kernel kernel_list_h)
add_dependencies(op op_list_h)
lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)
lite_cc_library(program SRCS program.cc
......@@ -73,3 +100,17 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils)
lite_cc_test(test_types SRCS types_test.cc DEPS types)
lite_cc_test(test_memory SRCS memory_test.cc DEPS memory)
lite_cc_test(test_context SRCS context_test.cc DEPS context)
# # A trick to generate the paddle_use_kernels.h
# execute_process(
# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
# ${kernels_src_list}
# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
# )
# # A trick to generate the paddle_use_ops.h
# execute_process(
# COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
# ${ops_src_list}
# ${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
# )
......@@ -101,7 +101,7 @@ class Context<TargetType::kARM> {
void CopySharedTo(ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads) {
void SetRunMode(lite_api::PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
......@@ -109,7 +109,7 @@ class Context<TargetType::kARM> {
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const { return DeviceInfo::Global().mode(); }
lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
......
......@@ -119,7 +119,8 @@ size_t get_mem_size() {
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
printf("not implemented, set to default 4GB\n");
return 4096 * 1024;
#endif
return 0;
}
......@@ -209,7 +210,7 @@ void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
}
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->at(i) = APPLE;
archs->at(i) = kAPPLE;
}
#endif
}
......@@ -818,7 +819,7 @@ void DeviceInfo::RequestPowerFullMode(int thread_num) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
mode_ = lite_api::PowerMode::LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(int thread_num) {
......@@ -826,7 +827,7 @@ void DeviceInfo::RequestPowerHighMode(int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
......@@ -838,7 +839,7 @@ void DeviceInfo::RequestPowerHighMode(int thread_num) {
}
}
} else {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
......@@ -855,7 +856,7 @@ void DeviceInfo::RequestPowerLowMode(int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
......@@ -867,7 +868,7 @@ void DeviceInfo::RequestPowerLowMode(int thread_num) {
}
}
} else {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
......@@ -893,7 +894,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
}
}
}
mode_ = LITE_POWER_NO_BIND;
mode_ = lite_api::PowerMode::LITE_POWER_NO_BIND;
}
void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
......@@ -901,7 +902,7 @@ void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_RAND_HIGH;
if (thread_num > big_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
......@@ -913,7 +914,7 @@ void DeviceInfo::RequestPowerRandHighMode(int shift_num, int thread_num) {
}
}
} else {
mode_ = LITE_POWER_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_LOW;
LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
......@@ -930,7 +931,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
mode_ = lite_api::PowerMode::LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
......@@ -943,7 +944,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
}
}
} else {
mode_ = LITE_POWER_HIGH;
mode_ = lite_api::PowerMode::LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
......@@ -957,6 +958,7 @@ void DeviceInfo::RequestPowerRandLowMode(int shift_num, int thread_num) {
int DeviceInfo::Setup() {
core_num_ = get_cpu_num();
printf("core number: %d\n", core_num_);
mem_size_ = get_mem_size();
get_cpu_arch(&archs_, core_num_);
// set default CPU info
......@@ -966,10 +968,10 @@ int DeviceInfo::Setup() {
SetFP32Info(1, 1);
SetFP16Info(1, 0);
SetDotInfo(1, 0);
#ifdef LITE_WITH_LINUX
// get max&min freq
max_freqs_.resize(core_num_);
min_freqs_.resize(core_num_);
#ifdef LITE_WITH_LINUX
// get max&min freq
for (int i = 0; i < core_num_; ++i) {
int max_freq, min_freq;
get_cpu_max_min_freq(i, &max_freq, &min_freq);
......@@ -981,6 +983,30 @@ int DeviceInfo::Setup() {
if (!SetCPUInfoByName()) {
SetCPUInfoByProb();
}
core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
max_freqs_[i] = 1000000;
min_freqs_[i] = 1000000;
cluster_ids_[i] = 0;
}
#else
#ifdef TARGET_IOS
dev_name_ = "Apple";
#else
dev_name_ = "Unknown";
#endif
core_ids_.resize(core_num_);
cluster_ids_.resize(core_num_);
big_core_ids_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
max_freqs_[i] = 1000000;
min_freqs_[i] = 1000000;
cluster_ids_[i] = 0;
core_ids_[i] = i;
big_core_ids_[i] = i;
}
#endif
// output info
LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
LOG(INFO) << "ARM multiprocessors number: " << core_num_;
......@@ -1004,13 +1030,12 @@ int DeviceInfo::Setup() {
LOG(INFO) << L3_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
// set default run mode
SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default
SetRunMode(lite_api::PowerMode::LITE_POWER_NO_BIND, 1); // use single thread by default
return 0;
}
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
thread_num = std::min(thread_num, core_num_);
#else
......@@ -1024,22 +1049,22 @@ void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
count_++;
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
case lite_api::LITE_POWER_FULL:
RequestPowerFullMode(thread_num);
break;
case LITE_POWER_HIGH:
case lite_api::LITE_POWER_HIGH:
RequestPowerHighMode(thread_num);
break;
case LITE_POWER_LOW:
case lite_api::LITE_POWER_LOW:
RequestPowerLowMode(thread_num);
break;
case LITE_POWER_NO_BIND:
case lite_api::LITE_POWER_NO_BIND:
RequestPowerNoBindMode(thread_num);
break;
case LITE_POWER_RAND_HIGH:
case lite_api::LITE_POWER_RAND_HIGH:
RequestPowerRandHighMode(shift_num, thread_num);
break;
case LITE_POWER_RAND_LOW:
case lite_api::LITE_POWER_RAND_LOW:
RequestPowerRandLowMode(shift_num, thread_num);
break;
default:
......@@ -1052,12 +1077,12 @@ void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
if (mode_ != LITE_POWER_NO_BIND) {
if (mode_ != lite_api::LITE_POWER_NO_BIND) {
if (check_cpu_online(active_ids_)) {
bind_threads(active_ids_);
} else {
LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
mode_ = LITE_POWER_NO_BIND;
mode_ = lite_api::LITE_POWER_NO_BIND;
}
}
#else // LITE_WITH_LINUX
......@@ -1080,7 +1105,7 @@ void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
workspace_.Resize({2 * (l1size + l2size)});
}
bool DeviceInfo::ExtendWorkspace(size_t size) {
bool DeviceInfo::ExtendWorkspace(int size) {
workspace_.Resize({size + llc_size()});
workspace_.mutable_data<int8_t>();
return true;
......
......@@ -25,15 +25,6 @@ namespace lite {
#ifdef LITE_WITH_ARM
typedef enum {
LITE_POWER_HIGH = 0,
LITE_POWER_LOW = 1,
LITE_POWER_FULL = 2,
LITE_POWER_NO_BIND = 3,
LITE_POWER_RAND_HIGH = 4,
LITE_POWER_RAND_LOW = 5
} PowerMode;
typedef enum {
kAPPLE = 0,
kA53 = 53,
......@@ -60,11 +51,11 @@ class DeviceInfo {
int Setup();
void SetRunMode(PowerMode mode, int thread_num);
void SetRunMode(lite_api::PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
PowerMode mode() const { return mode_; }
lite_api::PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
......@@ -82,7 +73,7 @@ class DeviceInfo {
T* workspace_data() {
return reinterpret_cast<T*>(workspace_.mutable_data<int8_t>());
}
bool ExtendWorkspace(size_t size);
bool ExtendWorkspace(int size);
private:
int core_num_;
......@@ -107,7 +98,7 @@ class DeviceInfo {
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
lite_api::PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
......
......@@ -37,7 +37,7 @@ namespace lite {
namespace mir {
namespace subgraph {
void GenerateNPUProgramPass::NPUSortHelper(
void GenerateNPUProgramPass::SubgraphSortHelper(
Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
......@@ -46,7 +46,7 @@ void GenerateNPUProgramPass::NPUSortHelper(
if (var_node->inlinks.empty()) continue;
auto* op_node = var_node->inlinks.front();
if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) {
NPUSortHelper(op_node, nodes_all, visited_nodes, ret);
SubgraphSortHelper(op_node, nodes_all, visited_nodes, ret);
}
}
ret->push_back(node);
......@@ -55,40 +55,68 @@ void GenerateNPUProgramPass::NPUSortHelper(
void GenerateNPUProgramPass::CvtOpNodes(
const std::vector<Node*>& nodes2cvt,
std::vector<std::string>* in_vars_name,
std::vector<std::string>* out_vars_name,
lite::npu::bridge::node_map_type* cvted_vars,
std::unordered_set<const Node*>* nodes2rm) {
lite::npu::bridge::node_map_type* cvted_vars) {
const auto& bridges = lite::npu::bridge::Factory::Instance();
const auto& cvtfunc_map = bridges.AllFunctions();
// record all converted vars
// op node's inputs must be found in cvted_vars
for (auto& node : nodes2cvt) {
lite::npu::bridge::node_map_type node_inputs;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
auto var_name = arg.name;
if (!cvted_vars->count(var_name)) {
if (arg.is_weight) continue;
cvted_vars->insert(std::make_pair(
var_name,
lite::npu::bridge::CvtNode(var_node, stmt.op()->scope())));
in_vars_name->push_back(var_name);
}
node_inputs.insert(*cvted_vars->find(var_name));
}
auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs);
cvted_vars->insert(node_outputs.begin(), node_outputs.end());
nodes2rm->insert(node);
for (auto& var_node : node->outlinks) {
for (auto& next_op_node : var_node->outlinks) {
if (std::find(nodes2cvt.begin(), nodes2cvt.end(), next_op_node) ==
nodes2cvt.end()) {
out_vars_name->push_back(var_node->AsArg().name);
break;
}
}
}
void GenerateNPUProgramPass::GetIOVars(
const std::vector<Node*>& nodes2cvt,
const lite::npu::bridge::node_map_type& cvted_vars,
std::unordered_set<const Node*>* nodes2rm,
std::vector<Node*>* in_vars,
std::vector<Node*>* out_vars,
lite::npu::bridge::node_map_type* in_cvted_vars,
lite::npu::bridge::node_map_type* out_cvted_vars) {
std::unordered_set<Node*> op_nodes_all(nodes2cvt.begin(), nodes2cvt.end());
for (auto& op_node : nodes2cvt) {
for (auto& in_var : op_node->inlinks) {
if (in_var->AsArg().is_weight) continue;
auto* pre_op_node = in_var->inlinks.front();
if (op_nodes_all.count(pre_op_node)) {
nodes2rm->insert(in_var);
continue;
}
in_vars->push_back(in_var);
auto arg_name = in_var->AsArg().name;
in_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
}
for (auto& out_var : op_node->outlinks) {
if (out_var->outlinks.empty()) {
nodes2rm->insert(out_var);
continue;
}
auto* next_op_node = out_var->outlinks.front();
if (op_nodes_all.count(next_op_node)) {
nodes2rm->insert(out_var);
continue;
}
out_vars->push_back(out_var);
auto arg_name = out_var->AsArg().name;
out_cvted_vars->insert(std::make_pair(arg_name, cvted_vars.at(arg_name)));
}
}
nodes2rm->insert(nodes2cvt.begin(), nodes2cvt.end());
}
void GenerateNPUProgramPass::GenNPUGraphOpNode(
......@@ -100,23 +128,38 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
for (auto& node : nodes_all) {
if (!node->IsStmt()) continue;
if (visited_nodes.count(node)) continue;
NPUSortHelper(node, nodes_all, &visited_nodes, &ret);
SubgraphSortHelper(node, nodes_all, &visited_nodes, &ret);
}
std::vector<std::string> in_vars_name;
std::vector<std::string> out_vars_name;
lite::npu::bridge::node_map_type cvted_vars;
CvtOpNodes(ret, &cvted_vars);
std::unordered_set<const Node*> nodes2rm;
CvtOpNodes(ret, &in_vars_name, &out_vars_name, &cvted_vars, &nodes2rm);
// insert new graph op node
std::vector<Node*> in_vars;
std::vector<Node*> out_vars;
lite::npu::bridge::node_map_type in_cvted_vars;
lite::npu::bridge::node_map_type out_cvted_vars;
GetIOVars(ret,
cvted_vars,
&nodes2rm,
&in_vars,
&out_vars,
&in_cvted_vars,
&out_cvted_vars);
std::vector<std::string> in_vars_name;
std::vector<std::string> out_vars_name;
std::vector<ge::Operator> inputs;
std::vector<ge::Operator> outputs;
for (auto i : in_vars_name) {
inputs.push_back(*cvted_vars.at(i));
for (auto i : in_cvted_vars) {
in_vars_name.push_back(i.first);
inputs.push_back(*i.second);
}
for (auto i : out_vars_name) {
outputs.push_back(*cvted_vars.at(i));
for (auto i : out_cvted_vars) {
out_vars_name.push_back(i.first);
outputs.push_back(*i.second);
}
std::string model_name("hiai_npu_client_" + std::to_string(sub_id) + ".om");
if (!npu::BuildNPUClient(inputs, outputs, model_name)) {
LOG(FATAL) << "Build NPU failed subgraph " << sub_id;
......@@ -125,27 +168,25 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
cpp::OpDesc op_desc;
op_desc.SetType("graph_op");
std::vector<std::string> in_var_names;
op_desc.SetInput("Inputs", in_vars_name);
op_desc.SetOutput("Outputs", out_vars_name);
op_desc.SetAttr("model_name", model_name);
auto graph_op = LiteOpRegistry::Global().Create("graph_op");
// TODO(zpy): support multi inputs op
auto start_op = ret.front()->AsStmt().op();
auto* scope = start_op->scope();
auto any_op = ret.front()->AsStmt().op();
auto* scope = any_op->scope();
graph_op->Attach(op_desc, scope);
auto valid_places = start_op->valid_places();
auto valid_places = any_op->valid_places();
auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places);
for (auto& var_node : ret.front()->inlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
IR_NODE_LINK_TO(var_node, new_op_node);
for (auto& in_var : in_vars) {
IR_NODE_LINK_TO(in_var, new_op_node);
}
for (auto& var_node : ret.back()->outlinks) {
auto& arg = var_node->AsArg();
if (arg.is_weight) continue;
IR_NODE_LINK_TO(var_node, new_op_node);
for (auto& out_var : out_vars) {
IR_OP_VAR_LINK(new_op_node, out_var);
}
// assign context
......@@ -159,8 +200,10 @@ void GenerateNPUProgramPass::GenNPUGraphOpNode(
void GenerateNPUProgramPass::ConvertSubgraph(
const std::unique_ptr<SSAGraph>& graph, int sub_num) {
std::unordered_map<int, std::unordered_set<Node*>> nodes_all;
int ops_num = 0;
for (auto& item : graph->StmtTopologicalOrder()) {
if (!item->IsStmt()) continue;
ops_num++;
auto& stmt = item->AsStmt();
int sub_id = stmt.subgraph_id();
if (sub_id < 1) continue;
......@@ -178,6 +221,7 @@ void GenerateNPUProgramPass::ConvertSubgraph(
void GenerateNPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "Before NPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::npu::bridge::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
......@@ -215,5 +259,3 @@ std::unique_ptr<RuntimeProgram> GenerateNPUProgramPass::GenProgram() {
REGISTER_MIR_PASS(generate_npu_program_pass,
paddle::lite::mir::subgraph::GenerateNPUProgramPass);
// USE_LITE_OP(graph_op);
......@@ -38,21 +38,27 @@ class GenerateNPUProgramPass : public SubgraphProgramPass {
std::unique_ptr<RuntimeProgram> GenProgram();
protected:
void NPUSortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
// sort nodes into topological (execution) order
void SubgraphSortHelper(Node* node,
const std::unordered_set<Node*>& nodes_all,
std::unordered_set<const Node*>* visited_nodes,
std::vector<Node*>* ret);
// nodes2cvt: op nodes to convert
// in_vars_name: graph op's inputs var name
// out_vars_name: graph op's outputs var name
// vcted_vars:
// cvted_vars: converted var nodes
// nodes2rm: op nodes and var nodes that need to be removed
void CvtOpNodes(const std::vector<Node*>& nodes2cvt,
std::vector<std::string>* in_vars_name,
std::vector<std::string>* out_vars_name,
lite::npu::bridge::node_map_type* cvted_vars,
std::unordered_set<const Node*>* nodes2rm);
lite::npu::bridge::node_map_type* cvted_vars);
// collect input and output vars/cvted_vars;
// collect all nodes to remove
void GetIOVars(const std::vector<Node*>& nodes2cvt,
const lite::npu::bridge::node_map_type& cvted_vars,
std::unordered_set<const Node*>* nodes2rm,
std::vector<Node*>* in_vars,
std::vector<Node*>* out_vars,
lite::npu::bridge::node_map_type* in_cvted_vars,
lite::npu::bridge::node_map_type* out_cvted_vars);
void GenNPUGraphOpNode(const std::unique_ptr<SSAGraph>& graph,
int sub_id,
......
......@@ -26,17 +26,49 @@ namespace paddle {
namespace lite {
namespace profile {
template <typename dtype>
static void write_tensorfile(const Tensor* tensor, const std::string& locate) {
if (locate.find('/') != std::string::npos) {
return;
}
FILE* fp = fopen(locate.c_str(), "w");
if (fp == nullptr) {
  LOG(ERROR) << "file open failed " << locate;
} else {
  const dtype* data = tensor->data<dtype>();
  for (int i = 0; i < tensor->numel(); ++i) {
    fprintf(fp, "[%d] %f \n", i, static_cast<float>(data[i]));
  }
  fclose(fp);  // only close when fopen succeeded
}
}
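Usage note (hedged): because of the guard at the top, write_tensorfile only accepts bare file names; any locate containing '/' is silently skipped. A hypothetical call:

// Dump a float tensor to the current working directory for debugging.
write_tensorfile<float>(output_tensor, "conv1_out.txt");  // output_tensor is hypothetical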
class PrecisionProfiler {
public:
explicit PrecisionProfiler(const Instruction* inst) : inst_(inst) {}
~PrecisionProfiler() {
LOG(INFO) << ">> Running kernel: " << inst_->op()->op_info()->Repr()
<< " on Target " << TargetToStr(inst_->kernel()->target());
auto tensor_mean = [](const Tensor* in, PrecisionType ptype) -> double {
<< " on Target " << TargetToStr(inst_->kernel()->target()) << " "
<< PrecisionToStr(inst_->kernel()->precision());
auto tensor_mean = [](const Tensor* in,
PrecisionType ptype,
std::string name = "inst") -> double {
if (!in->data<int8_t>()) {
return -99999;
}
double sum = 0.;
switch (ptype) {
case PRECISION(kFloat): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
return sum / in->numel();
}
case PRECISION(kAny): {
auto ptr = in->data<float>();
// write_tensorfile<float>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -44,6 +76,7 @@ class PrecisionProfiler {
}
case PRECISION(kInt8): {
auto ptr = in->data<int8_t>();
// write_tensorfile<int8_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -51,6 +84,7 @@ class PrecisionProfiler {
}
case PRECISION(kInt32): {
auto ptr = in->data<int32_t>();
// write_tensorfile<int32_t>(in, name);
for (int i = 0; i < in->numel(); ++i) {
sum += ptr[i];
}
......@@ -70,17 +104,18 @@ class PrecisionProfiler {
std::string out_arg_name;
op->op_info()->GetOutputArgname(out_name, &out_arg_name);
auto type = kernel->GetOutputDeclType(out_arg_name);
if (type->IsTensor()) {
auto tout = op_scope->FindVar(out_name)->GetMutable<Tensor>();
double mean = tensor_mean(tout, type->precision());
double mean = tensor_mean(tout, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << tout->dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
<< ", mean value: " << mean << " shape:" << tout->dims();
} else if (type->IsTensorList()) {
auto tout =
op_scope->FindVar(out_name)->GetMutable<std::vector<Tensor>>();
for (auto& t : *tout) {
double mean = tensor_mean(&t, type->precision());
double mean = tensor_mean(&t, type->precision(), out_name);
LOG(INFO) << "output name: " << out_name << ", dims: " << t.dims()
<< ", precision: " << PrecisionToStr(type->precision())
<< ", mean value: " << mean;
......
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
message(STATUS "add lite kernels")
set(lite_kernel_deps type_system kernel op op_registry context tensor CACHE INTERNAL "" FORCE)
set(lite_kernel_deps type_system kernel op op_registry context tensor any CACHE INTERNAL "" FORCE)
add_subdirectory(host)
add_subdirectory(arm)
......
......@@ -4,64 +4,66 @@ endif()
message(STATUS "compile with lite ARM kernels")
lite_cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(matmul_compute_arm SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lrn_compute_arm SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(decode_bboxes_compute_arm SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(multiclass_nms_compute_arm SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(pad2d_compute_arm SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(prior_box_compute_arm SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(density_prior_box_compute_arm SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(negative_compute_arm SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(crop_compute_arm SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(power_compute_arm SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(yolo_box_compute_arm SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(shuffle_channel_compute_arm SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(argmax_compute_arm SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(axpy_compute_arm SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(conv_transpose_compute_arm SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(gru_unit_compute_arm SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(gru_compute_arm SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(beam_search_decode_compute_arm SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lookup_table_compute_arm SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(im2sequence_compute_arm SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_softmax_compute_arm SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(norm_compute_arm SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(interpolate_compute_arm SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(logical_compute_arm SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(less_than_arm SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(while_compute_arm SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(compare_compute_arm SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(topk_compute_arm SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(increment_compute_arm SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(write_to_array_compute_arm SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(read_from_array_compute_arm SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(beam_search_compute_arm SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(fill_constant_compute_arm SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(lod_reset_compute_arm SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(box_coder_compute_arm SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_pool_compute_arm SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(sequence_expand_compute_arm SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(reduce_max_compute_arm SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(is_empty_compute_arm SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(shape_compute_arm SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(slice_compute_arm SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(cast_compute_arm SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(squeeze_compute_arm SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
lite_cc_library(expand_compute_arm SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fc_compute_arm ARM basic SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(activation_compute_arm ARM basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(mul_compute_arm ARM basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(matmul_compute_arm ARM basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conv_compute_arm ARM basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(multiclass_nms_compute_arm ARM basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
# OCR-specific kernels, built at the "extra" compile level
add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
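# The add_kernel() calls above replace the old lite_cc_library() lines plus the
# hand-maintained cached kernel list below. A minimal sketch of what such a
# helper is assumed to do (not the actual implementation): build the library,
# skip "extra"-level kernels unless LITE_BUILD_EXTRA is ON, and append the
# target to a per-device cached list such as arm_kernels.
function(add_kernel TARGET device level)
  if((level STREQUAL "extra") AND (NOT LITE_BUILD_EXTRA))
    return()
  endif()
  lite_cc_library(${TARGET} ${ARGN})  # remaining args are SRCS ... DEPS ...
  string(TOLOWER "${device}" dev)
  set(${dev}_kernels "${${dev}_kernels};${TARGET}" CACHE INTERNAL "${dev} kernels")
endfunction()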
lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
......@@ -77,71 +79,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm)
lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
set(arm_kernels
fc_compute_arm
activation_compute_arm
mul_compute_arm
matmul_compute_arm
scale_compute_arm
softmax_compute_arm
conv_compute_arm
batch_norm_compute_arm
elementwise_compute_arm
lrn_compute_arm
decode_bboxes_compute_arm
multiclass_nms_compute_arm
pool_compute_arm
split_compute_arm
concat_compute_arm
pad2d_compute_arm
prior_box_compute_arm
density_prior_box_compute_arm
negative_compute_arm
crop_compute_arm
dropout_compute_arm
transpose_compute_arm
calib_compute_arm
argmax_compute_arm
axpy_compute_arm
conv_transpose_compute_arm
gru_unit_compute_arm
gru_compute_arm
beam_search_decode_compute_arm
lookup_table_compute_arm
im2sequence_compute_arm
sequence_softmax_compute_arm
norm_compute_arm
power_compute_arm
shuffle_channel_compute_arm
yolo_box_compute_arm
interpolate_compute_arm
logical_compute_arm
less_than_arm
while_compute_arm
compare_compute_arm
topk_compute_arm
increment_compute_arm
write_to_array_compute_arm
read_from_array_compute_arm
beam_search_compute_arm
fill_constant_compute_arm
lod_reset_compute_arm
box_coder_compute_arm
reduce_max_compute_arm
sequence_expand_compute_arm
sequence_pool_compute_arm
is_empty_compute_arm
shape_compute_arm
slice_compute_arm
cast_compute_arm
squeeze_compute_arm
expand_compute_arm
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
......@@ -48,13 +48,12 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
void DensityPriorBoxCompute::Run() {
auto& param = Param<operators::DensityPriorBoxParam>();
bool is_flip = param.flip;
bool is_clip = param.clip;
std::vector<float> min_size = param.min_sizes;
std::vector<float> fixed_size = param.fixed_sizes;
std::vector<float> fixed_ratio = param.fixed_ratios;
std::vector<float> density_size = param.density_sizes;
auto density_size = param.density_sizes;
std::vector<float> max_size = param.max_sizes;
std::vector<float> aspect_ratio = param.aspect_ratios;
std::vector<float> variance = param.variances_;
......
if (NOT LITE_WITH_FPGA)
return()
endif()
message("fpga : ${lite_kernel_deps}")
set(fpga_deps fpga_target_wrapper kernel_fpga)
lite_cc_library(activation_compute_fpga SRCS activation_compute.cc DEPS ${fpga_deps})
add_kernel(activation_compute_fpga FPGA basic SRCS activation_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_activation_fpga SRCS activation_compute_test.cc DEPS ${lite_kernel_deps} activation_compute_fpga ${fpga_deps})
lite_cc_library(conv_compute_fpga SRCS conv_compute.cc DEPS ${fpga_deps})
add_kernel(conv_compute_fpga FPGA basic SRCS conv_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_conv_fpga SRCS conv_compute_test.cc DEPS ${lite_kernel_deps} conv_compute_fpga ${fpga_deps})
lite_cc_library(elementwise_compute_fpga SRCS elementwise_compute.cc DEPS ${fpga_deps})
add_kernel(elementwise_compute_fpga FPGA basic SRCS elementwise_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_elementwise_fpga SRCS elementwise_compute_test.cc DEPS ${lite_kernel_deps} elementwise_compute_fpga ${fpga_deps})
lite_cc_library(pooling_compute_fpga SRCS pooling_compute.cc DEPS ${fpga_deps})
add_kernel(pooling_compute_fpga FPGA basic SRCS pooling_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_pooling_compute_fpga SRCS pooling_compute_test.cc DEPS ${lite_kernel_deps} pooling_compute_fpga ${fpga_deps})
lite_cc_library(scale_compute_fpga SRCS scale_compute.cc DEPS ${fpga_deps})
add_kernel(scale_compute_fpga FPGA basic SRCS scale_compute.cc DEPS ${fpga_deps})
lite_cc_library(softmax_compute_fpga SRCS softmax_compute.cc DEPS ${fpga_deps})
add_kernel(softmax_compute_fpga FPGA basic SRCS softmax_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_softmax_compute_fpga SRCS softmax_compute_test.cc DEPS ${lite_kernel_deps} softmax_compute_fpga ${fpga_deps})
lite_cc_library(fc_compute_fpga SRCS fc_compute.cc DEPS ${fpga_deps})
add_kernel(fc_compute_fpga FPGA basic SRCS fc_compute.cc DEPS ${fpga_deps})
lite_cc_test(test_fc_compute_fpga SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_fpga ${fpga_deps})
lite_cc_library(io_copy_compute_fpga SRCS io_copy_compute.cc DEPS ${fpga_deps})
lite_cc_library(calib_compute_fpga SRCS calib_compute.cc DEPS ${fpga_deps})
lite_cc_library(layout_compute_fpga SRCS layout_compute.cc DEPS ${fpga_deps})
lite_cc_library(feed_compute_fpga SRCS feed_compute.cc DEPS ${fpga_deps})
lite_cc_library(fetch_compute_fpga SRCS fetch_compute.cc DEPS ${fpga_deps})
set (fpga_kernels
activation_compute_fpga
conv_compute_fpga
elementwise_compute_fpga
pooling_compute_fpga
scale_compute_fpga
softmax_compute_fpga
fc_compute_fpga
io_copy_compute_fpga
calib_compute_fpga
layout_compute_fpga
feed_compute_fpga
fetch_compute_fpga
)
set(fpga_kernels "${fpga_kernels}" CACHE INTERNAL "fpga kernels")
add_kernel(io_copy_compute_fpga FPGA basic SRCS io_copy_compute.cc DEPS ${fpga_deps})
add_kernel(calib_compute_fpga FPGA basic SRCS calib_compute.cc DEPS ${fpga_deps})
add_kernel(layout_compute_fpga FPGA basic SRCS layout_compute.cc DEPS ${fpga_deps})
add_kernel(feed_compute_fpga FPGA basic SRCS feed_compute.cc DEPS ${fpga_deps})
add_kernel(fetch_compute_fpga FPGA basic SRCS fetch_compute.cc DEPS ${fpga_deps})
message(STATUS "compile with lite host kernels")
lite_cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(reshape_compute_host SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host)
set(host_kernels
feed_compute_host
fetch_compute_host
reshape_compute_host
)
set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
......@@ -93,3 +93,40 @@ REGISTER_LITE_KERNEL(reshape2,
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
REGISTER_LITE_KERNEL(flatten,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Shape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
REGISTER_LITE_KERNEL(flatten2,
kHost,
kAny,
kAny,
paddle::lite::kernels::host::ReshapeCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindInput("Shape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("XShape",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.Finalize();
......@@ -2,12 +2,8 @@
if(NOT LITE_WITH_NPU)
return ()
endif()
message(STATUS "compile with lite NPU kernels")
lite_cc_library(graph_compute_npu SRCS graph_compute.cc DEPS ${lite_kernel_deps} ${npu_ddk_libs})
add_kernel(graph_compute_npu NPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} ${npu_ddk_libs})
# lite_cc_test(test_graph_compute_npu SRCS graph_compute_test.cc DEPS graph_compute_npu)
set(npu_kernels graph_compute_npu)
set(npu_kernels "${npu_kernels}" CACHE INTERNAL "npu kernels")
......@@ -4,17 +4,17 @@ endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper)
lite_cc_library(fc_opencl SRCS fc_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(mul_opencl SRCS mul_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(elementwise_add_opencl SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(fusion_elementwise_add_activation_opencl
SRCS fusion_elementwise_add_activation_compute.cc
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
lite_cc_library(pool_opencl SRCS pool_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(io_copy_compute_opencl SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
lite_cc_library(relu_opencl SRCS relu_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(depthwise_conv2d_opencl SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
lite_cc_library(conv_opencl SRCS conv_compute.cc DEPS ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
......@@ -47,15 +47,3 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/opencl)
set(opencl_kernels
io_copy_compute_opencl
elementwise_add_opencl
fusion_elementwise_add_activation_opencl
pool_opencl
relu_opencl
mul_opencl
fc_opencl
depthwise_conv2d_opencl
conv_opencl
CACHE INTERNAL "opencl_kernels")
......@@ -10,7 +10,7 @@ endif()
# lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(relu_compute_x86 SRCS relu_compute.cc DEPS ${lite_kernel_deps})
lite_cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps})
add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op elementwise_add_op)
# lite_cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
# lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} )
......@@ -31,23 +31,3 @@ lite_cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps}
# lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
# lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
# lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
set(x86_kernels
# activation_compute_x86
# elementwise_compute_x86
# mean_compute_x86
# fill_constant_compute_x86
# mul_compute_x86
# relu_compute_x86
# fc_compute_x86
scale_compute_x86
# softmax_compute_x86
# dropout_compute_x86
# concat_compute_x86
# conv_compute_x86
# pool_compute_x86
# batch_norm_compute_x86
# uniform_random_compute_x86
# sgd_compute_x86
CACHE INTERNAL "x86 kernels")
......@@ -30,12 +30,14 @@ namespace bridge {
node_map_type BatchNormConverter(
const std::shared_ptr<lite::OpLite> batch_norm_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting batchnorm...";
lite::Scope* scope = batch_norm_op->scope();
const lite::OpInfo* op_info = batch_norm_op->op_info();
auto scope = batch_norm_op->scope();
auto op_info = batch_norm_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::BatchNorm> output_node =
std::make_shared<ge::op::BatchNorm>(UniqueName("batch_norm"));
std::shared_ptr<ge::op::BatchNorm> batch_norm_node =
std::make_shared<ge::op::BatchNorm>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto scale_var_name = op_info->Input("Scale").front();
......@@ -68,21 +70,21 @@ node_map_type BatchNormConverter(
int npu_mode = 1; // bnScale, bnBias tensor dims are 1xCx1x1
bool npu_use_global_stats = op_info->GetAttr<bool>("use_global_stats");
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_input_scale(*npu_scale);
output_node->set_input_b(*npu_bias);
output_node->set_input_mean(*npu_mean);
output_node->set_input_variance(*npu_variance);
output_node->set_attr_momentum(npu_momentum);
output_node->set_attr_epsilon(npu_epsilon);
output_node->set_attr_mode(npu_mode);
output_node->set_attr_use_global_stats(npu_use_global_stats);
batch_norm_node->set_input_x(*inputs_map.at(x_var_name));
batch_norm_node->set_input_scale(*npu_scale);
batch_norm_node->set_input_b(*npu_bias);
batch_norm_node->set_input_mean(*npu_mean);
batch_norm_node->set_input_variance(*npu_variance);
batch_norm_node->set_attr_momentum(npu_momentum);
batch_norm_node->set_attr_epsilon(npu_epsilon);
batch_norm_node->set_attr_mode(npu_mode);
batch_norm_node->set_attr_use_global_stats(npu_use_global_stats);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(batch_norm_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Y").front()] = output_node;
outputs_map[op_info->Output("Y").front()] = batch_norm_node;
return outputs_map;
}
......
......@@ -30,11 +30,14 @@ namespace bridge {
node_map_type ElementwiseConverter(
const std::shared_ptr<lite::OpLite> elementwise_op,
const node_map_type& inputs_map) {
auto scope = elementwise_op->scope();
auto op_info = elementwise_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "converting elementwise...";
lite::Scope* scope = elementwise_op->scope();
const lite::OpInfo* op_info = elementwise_op->op_info();
std::shared_ptr<ge::op::Eltwise> output_node =
std::make_shared<ge::op::Eltwise>(UniqueName("elementwise"));
std::shared_ptr<ge::op::Eltwise> elementwise_node =
std::make_shared<ge::op::Eltwise>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front();
......@@ -43,27 +46,27 @@ node_map_type ElementwiseConverter(
<< "npu elementwise only support inputs with same size";
CHECK(inputs_map.find(x_var_name) != inputs_map.end());
output_node->set_input_x1(*inputs_map.at(x_var_name));
elementwise_node->set_input_x1(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name));
if (inputs_map.find(y_var_name) != inputs_map.end()) {
output_node->set_input_x2(*inputs_map.at(y_var_name));
elementwise_node->set_input_x2(*inputs_map.at(y_var_name));
OpList::Global().add(inputs_map.at(y_var_name));
} else {
auto consty = std::make_shared<ge::op::Const>(y_var_name);
auto* y = scope->FindVar(y_var_name)->GetMutable<Tensor>();
consty->set_attr_value(CvtFromLiteTensor(y));
output_node->set_input_x2(*consty);
elementwise_node->set_input_x2(*consty);
OpList::Global().add(consty);
}
OpList::Global().add(output_node);
OpList::Global().add(elementwise_node);
// PaddleLite only supports elementwise sum here, so the NPU eltwise mode is fixed to 1 (sum)
output_node->set_attr_mode(1);
elementwise_node->set_attr_mode(1);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = elementwise_node;
return outputs_map;
}
......
......@@ -29,12 +29,14 @@ namespace bridge {
node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting pool...";
lite::Scope* scope = pool_op->scope();
const lite::OpInfo* op_info = pool_op->op_info();
auto scope = pool_op->scope();
auto op_info = pool_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Pooling> output_node =
std::make_shared<ge::op::Pooling>(UniqueName("pool"));
std::shared_ptr<ge::op::Pooling> pool_node =
std::make_shared<ge::op::Pooling>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
int npu_mode = 0;
......@@ -61,21 +63,21 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
npu_ceil_mode = op_info->GetAttr<bool>("ceil_mode") ? 1 : 0;
}
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_mode(npu_mode);
output_node->set_attr_pad_mode(0);
output_node->set_attr_global_pooling(npu_global_pooling);
output_node->set_attr_window(npu_window);
output_node->set_attr_pad(npu_pad);
output_node->set_attr_stride(npu_stride);
output_node->set_attr_ceil_mode(npu_ceil_mode);
pool_node->set_input_x(*inputs_map.at(x_var_name));
pool_node->set_attr_mode(npu_mode);
pool_node->set_attr_pad_mode(0);
pool_node->set_attr_global_pooling(npu_global_pooling);
pool_node->set_attr_window(npu_window);
pool_node->set_attr_pad(npu_pad);
pool_node->set_attr_stride(npu_stride);
pool_node->set_attr_ceil_mode(npu_ceil_mode);
// output_node->set_attr_data_mode(npu_data_mode);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(pool_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = pool_node;
return outputs_map;
}
......
......@@ -30,22 +30,24 @@ namespace bridge {
node_map_type ShuffleChannelConverter(
const std::shared_ptr<lite::OpLite> shuffle_channel_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting shuffle_channel...";
lite::Scope* scope = shuffle_channel_op->scope();
const lite::OpInfo* op_info = shuffle_channel_op->op_info();
std::shared_ptr<ge::op::ShuffleChannel> output_node =
std::make_shared<ge::op::ShuffleChannel>(UniqueName("shuffle_channel"));
auto scope = shuffle_channel_op->scope();
auto op_info = shuffle_channel_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node =
std::make_shared<ge::op::ShuffleChannel>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_group(op_info->GetAttr<int>("group"));
shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name));
shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group"));
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(shuffle_channel_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = shuffle_channel_node;
return outputs_map;
}
......
......@@ -29,12 +29,14 @@ namespace bridge {
node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting softmax...";
lite::Scope* scope = softmax_op->scope();
const lite::OpInfo* op_info = softmax_op->op_info();
auto scope = softmax_op->scope();
auto op_info = softmax_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Softmax> output_node =
std::make_shared<ge::op::Softmax>(UniqueName("softmax"));
std::shared_ptr<ge::op::Softmax> softmax_node =
std::make_shared<ge::op::Softmax>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
auto x_dims = scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims();
......@@ -46,14 +48,14 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
}
CHECK(inputs_map.count(x_var_name));
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_attr_axis(axis);
softmax_node->set_input_x(*inputs_map.at(x_var_name));
softmax_node->set_attr_axis(axis);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(softmax_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = softmax_node;
return outputs_map;
}
......
......@@ -30,19 +30,21 @@ namespace bridge {
node_map_type TransposeConverter(
const std::shared_ptr<lite::OpLite> transpose_op,
const node_map_type& inputs_map) {
LOG(INFO) << "converting transpose...";
lite::Scope* scope = transpose_op->scope();
const lite::OpInfo* op_info = transpose_op->op_info();
auto scope = transpose_op->scope();
auto op_info = transpose_op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Permute> output_node =
std::make_shared<ge::op::Permute>(UniqueName("transpose"));
std::shared_ptr<ge::op::Permute> transpose_node =
std::make_shared<ge::op::Permute>(unique_op_type);
auto x_var_name = op_info->Input("X").front();
// PaddleLite's transpose op has no such input, but the NPU op requires
// w to be set even though its value is never used
auto w_var_name = "transpose_w";
auto w_var_name = unique_op_type + "/w";
auto* w = scope->Var(w_var_name)->GetMutable<Tensor>();
w->Resize(scope->FindVar(x_var_name)->GetMutable<Tensor>()->dims());
w->Resize({1});
auto* w_data = w->mutable_data<float>();
for (int i = 0; i < w->numel(); i++) {
w_data[i] = 1.f;
......@@ -55,15 +57,15 @@ node_map_type TransposeConverter(
auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end());
CHECK(inputs_map.count(x_var_name));
output_node->set_input_x(*inputs_map.at(x_var_name));
output_node->set_input_w(*npu_w);
output_node->set_attr_order(npu_axis);
transpose_node->set_input_x(*inputs_map.at(x_var_name));
transpose_node->set_input_w(*npu_w);
transpose_node->set_attr_order(npu_axis);
OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(output_node);
OpList::Global().add(transpose_node);
node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node;
outputs_map[op_info->Output("Out").front()] = transpose_node;
return outputs_map;
}
......
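// Note on the w workaround in the transpose bridge above: PaddleLite's
// transpose op carries no W input, but the NPU Permute op requires one, so
// the converter materializes a dummy tensor -- now resized to {1} instead of
// duplicating the full input shape -- and feeds it in; only attr_order
// actually affects the result.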
set(op_DEPS tensor op op_params)
lite_cc_library(conv_op SRCS conv_op.cc DEPS ${op_DEPS})
lite_cc_library(pool_op SRCS pool_op.cc DEPS ${op_DEPS})
lite_cc_library(fc_op SRCS fc_op.cc DEPS ${op_DEPS})
lite_cc_library(relu_op SRCS relu_op.cc DEPS ${op_DEPS})
lite_cc_library(mul_op SRCS mul_op.cc DEPS ${op_DEPS})
lite_cc_library(matmul_op SRCS matmul_op.cc DEPS ${op_DEPS})
lite_cc_library(scale_op SRCS scale_op.cc DEPS ${op_DEPS})
lite_cc_library(softmax_op SRCS softmax_op.cc DEPS ${op_DEPS})
lite_cc_library(reshape_op SRCS reshape_op.cc DEPS ${op_DEPS} )
lite_cc_library(batch_norm_op SRCS batch_norm_op.cc DEPS ${op_DEPS})
lite_cc_library(feed_op SRCS feed_op.cc DEPS ${op_DEPS})
lite_cc_library(fetch_op SRCS fetch_op.cc DEPS ${op_DEPS})
lite_cc_library(io_copy_op SRCS io_copy_op.cc DEPS ${op_DEPS})
lite_cc_library(io_copy_once_op SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS})
lite_cc_library(activation_ops SRCS activation_ops.cc DEPS ${op_DEPS})
lite_cc_library(elementwise_ops SRCS elementwise_ops.cc DEPS ${op_DEPS})
lite_cc_library(lrn_op_lite SRCS lrn_op.cc DEPS ${op_DEPS})
lite_cc_library(decode_bboxes_op_lite SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
lite_cc_library(box_coder_op_lite SRCS box_coder_op.cc DEPS ${op_DEPS})
lite_cc_library(multiclass_nms_op_lite SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
lite_cc_library(fusion_elementwise_activation_ops SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS})
lite_cc_library(mean_op SRCS mean_op.cc DEPS ${op_DEPS})
lite_cc_library(fill_constant_op SRCS fill_constant_op.cc DEPS ${op_DEPS})
lite_cc_library(sgd_op SRCS sgd_op.cc DEPS ${op_DEPS})
lite_cc_library(uniform_random_op SRCS uniform_random_op.cc DEPS ${op_DEPS})
lite_cc_library(power_op SRCS power_op.cc DEPS ${op_DEPS})
lite_cc_library(shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${op_DEPS})
lite_cc_library(yolo_box_op SRCS yolo_box_op.cc DEPS ${op_DEPS})
lite_cc_library(interpolate_op SRCS interpolate_op.cc DEPS ${op_DEPS})
lite_cc_library(argmax_op SRCS argmax_op.cc DEPS ${op_DEPS})
lite_cc_library(axpy_op SRCS axpy_op.cc DEPS ${op_DEPS})
lite_cc_library(gru_unit_op SRCS gru_unit_op.cc DEPS ${op_DEPS})
lite_cc_library(gru_op SRCS gru_op.cc DEPS ${op_DEPS})
lite_cc_library(layout_op SRCS layout_op.cc DEPS ${op_DEPS})
lite_cc_library(layout_once_op SRCS layout_once_op.cc DEPS ${op_DEPS})
lite_cc_library(while_op SRCS while_op.cc DEPS ${op_DEPS})
lite_cc_library(lookup_table_op SRCS lookup_table_op.cc DEPS ${op_DEPS})
lite_cc_library(beam_search_decode_op SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
lite_cc_library(prior_box_op SRCS prior_box_op.cc DEPS ${op_DEPS})
lite_cc_library(density_prior_box_op SRCS density_prior_box_op.cc DEPS ${op_DEPS})
set(op_DEPS tensor op op_params scope memory)
lite_cc_library(op_params SRCS op_params.cc DEPS tensor any)
lite_cc_library(dropout_op SRCS dropout_op.cc DEPS ${op_DEPS})
lite_cc_library(concat_op SRCS concat_op.cc DEPS ${op_DEPS})
lite_cc_library(pad2d_op SRCS pad2d_op.cc DEPS ${op_DEPS})
lite_cc_library(negative_op SRCS negative_op.cc DEPS ${op_DEPS})
lite_cc_library(crop_op SRCS crop_op.cc DEPS ${op_DEPS})
lite_cc_library(calib_op SRCS calib_op.cc DEPS ${op_DEPS})
lite_cc_library(calib_once_op SRCS calib_once_op.cc DEPS ${op_DEPS})
lite_cc_library(split_op SRCS split_op.cc DEPS ${op_DEPS})
lite_cc_library(transpose_op SRCS transpose_op.cc DEPS ${op_DEPS})
lite_cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
lite_cc_library(fake_dequant SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS})
lite_cc_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS ${op_DEPS})
lite_cc_library(im2sequence_op SRCS im2sequence_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_softmax_op SRCS sequence_softmax_op.cc DEPS ${op_DEPS})
lite_cc_library(norm_op SRCS norm_op.cc DEPS ${op_DEPS})
lite_cc_library(graph_op SRCS graph_op.cc DEPS ${op_DEPS})
lite_cc_library(topk_op SRCS topk_op.cc DEPS ${op_DEPS})
lite_cc_library(increment_op SRCS increment_op.cc DEPS ${op_DEPS})
lite_cc_library(write_to_array_op SRCS write_to_array_op.cc DEPS ${op_DEPS})
lite_cc_library(graph_op_lite SRCS graph_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_xor SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_and SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_or SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(logical_not SRCS logical_op.cc DEPS ${op_DEPS})
lite_cc_library(less_than SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(not_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(less_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(greater_than SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(greater_equal SRCS compare_op.cc DEPS ${op_DEPS})
lite_cc_library(read_from_array_op SRCS read_from_array_op.cc DEPS ${op_DEPS})
lite_cc_library(beam_search_op SRCS beam_search_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_pool_op_lite SRCS sequence_pool_op.cc DEPS ${op_DEPS})
lite_cc_library(sequence_expand_op_lite SRCS sequence_expand_op.cc DEPS ${op_DEPS})
lite_cc_library(reduce_max_op_lite SRCS reduce_max_op.cc DEPS ${op_DEPS})
lite_cc_library(lod_reset_op SRCS lod_reset_op.cc DEPS ${op_DEPS})
lite_cc_library(is_empty SRCS is_empty_op.cc DEPS ${op_DEPS})
lite_cc_library(shape_op_lite SRCS shape_op.cc DEPS ${op_DEPS})
lite_cc_library(cast_op_lite SRCS cast_op.cc DEPS ${op_DEPS})
lite_cc_library(slice_op_lite SRCS slice_op.cc DEPS ${op_DEPS})
lite_cc_library(squeeze_op_lite SRCS squeeze_op.cc DEPS ${op_DEPS})
lite_cc_library(expand_op_lite SRCS expand_op.cc DEPS ${op_DEPS})
add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS})
add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS})
add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS})
add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS})
add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS})
add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS})
add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS})
add_operator(softmax_op basic SRCS softmax_op.cc DEPS ${op_DEPS})
add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} )
add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS})
add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS})
add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS})
add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS})
add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS})
add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS})
add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS})
add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS})
add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS})
add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS})
add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS})
add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS})
add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS})
add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS})
#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS})
add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS})
add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS})
add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS})
add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS})
add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS})
add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS})
add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS})
add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS})
add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS})
add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS})
add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS})
add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS})
add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS})
add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS})
add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS})
add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS})
add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS})
add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS})
add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS})
add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS})
add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS})
add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS})
add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS})
add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS})
add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS})
add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS})
add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS})
add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS})
add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS})
# OCR-specific operators, built at the "extra" compile level
add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS})
add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})
add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS})
add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS})
add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS})
add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_and extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_or extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(logical_not extra SRCS logical_op.cc DEPS ${op_DEPS})
add_operator(less_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(not_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(less_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS})
add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool_op_lite extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS})
add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS})
add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS})
add_operator(slice_op_lite extra SRCS slice_op.cc DEPS ${op_DEPS})
add_operator(write_to_array_op extra SRCS write_to_array_op.cc DEPS ${op_DEPS})
add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS})
add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS})
add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS})
set(ops
conv_op
pool_op
fc_op
relu_op
mul_op
matmul_op
scale_op
softmax_op
reshape_op
batch_norm_op
feed_op
fetch_op
gru_unit_op
gru_op
beam_search_decode_op
lookup_table_op
io_copy_op
io_copy_once_op
elementwise_ops
fusion_elementwise_activation_ops
lrn_op_lite
decode_bboxes_op_lite
multiclass_nms_op_lite
decode_bboxes_op_lite
box_coder_op_lite
multiclass_nms_op_lite
mean_op
fill_constant_op
activation_ops
dropout_op
concat_op
pad2d_op
crop_op
prior_box_op
density_prior_box_op
negative_op
calib_op
calib_once_op
split_op
transpose_op
fake_quant
fake_dequant
sgd_op
uniform_random_op
power_op
yolo_box_op
shuffle_channel_op
argmax_op
axpy_op
conv_transpose_op
im2sequence_op
sequence_softmax_op
norm_op
layout_op
layout_once_op
interpolate_op
logical_xor
logical_and
logical_or
logical_not
equal
not_equal
less_than
while_op
less_equal
greater_than
greater_equal
graph_op
topk_op
increment_op
write_to_array_op
read_from_array_op
beam_search_op
sequence_pool_op_lite
sequence_expand_op_lite
reduce_max_op_lite
lod_reset_op
is_empty
shape_op_lite
cast_op_lite
slice_op_lite
squeeze_op_lite
expand_op_lite
CACHE INTERNAL "ops lite")
if (NOT LITE_WITH_X86)
lite_cc_test(test_fc_op SRCS fc_op_test.cc
......@@ -184,7 +100,7 @@ if (NOT LITE_WITH_X86)
lite_cc_test(test_softmax_op SRCS softmax_op_test.cc DEPS softmax_op memory)
#lite_cc_test(test_reshape_op SRCS reshape_op_test.cc DEPS reshape_op memory)
lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory)
lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory)
lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope)
lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm)
lite_cc_test(test_fusion_elementwise_activation_ops
SRCS fusion_elementwise_activation_ops_test.cc
......
......@@ -85,7 +85,9 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc,
}
}
}
param_.fuse_relu = op_desc.GetAttr<bool>("fuse_relu");
if (op_desc.HasAttr("fuse_relu")) {
param_.fuse_relu = op_desc.GetAttr<bool>("fuse_relu");
}
return true;
}
......
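// The HasAttr() guard above is the general pattern for optional attributes:
// older serialized models may omit "fuse_relu", and calling GetAttr() on a
// missing attribute aborts. A hedged sketch of the same idea as a reusable
// helper (GetAttrOr is hypothetical, not part of the lite API):
template <typename T>
T GetAttrOr(const cpp::OpDesc &desc, const std::string &name, T fallback) {
  return desc.HasAttr(name) ? desc.GetAttr<T>(name) : fallback;
}
// usage: param_.fuse_relu = GetAttrOr(op_desc, "fuse_relu", false);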
......@@ -41,15 +41,29 @@ bool DensityPriorBoxOpLite::AttachImpl(const cpp::OpDesc& opdesc,
param_.boxes = scope->FindVar(boxes)->GetMutable<lite::Tensor>();
param_.variances = scope->FindVar(variances)->GetMutable<lite::Tensor>();
param_.flip = opdesc.GetAttr<bool>("flip");
param_.clip = opdesc.GetAttr<bool>("clip");
param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
param_.fixed_sizes = opdesc.GetAttr<std::vector<float>>("fixed_sizes");
param_.fixed_ratios = opdesc.GetAttr<std::vector<float>>("fixed_ratios");
param_.density_sizes = opdesc.GetAttr<std::vector<float>>("density_sizes");
param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
param_.variances_ = opdesc.GetAttr<std::vector<float>>("variances");
if (opdesc.HasAttr("aspect_ratios")) {
param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
}
if (opdesc.HasAttr("max_sizes")) {
param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
}
if (opdesc.HasAttr("density_sizes")) {
param_.density_sizes = opdesc.GetAttr<std::vector<int>>("density_sizes");
}
if (opdesc.HasAttr("densities")) {
param_.density_sizes = opdesc.GetAttr<std::vector<int>>("densities");
}
if (opdesc.HasAttr("min_sizes")) {
param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
}
if (opdesc.HasAttr("flip")) {
param_.flip = opdesc.GetAttr<bool>("flip");
}
if (opdesc.HasAttr("img_w")) {
param_.img_w = opdesc.GetAttr<int>("img_w");
}
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/fake_quantize_range_abs_max.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(fake_quantize_range_abs_max,
paddle::lite::operators::FakeQuantizeRangeMaxAbsOpLite);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class FakeQuantizeRangeMaxAbsOpLite : public OpLite {
public:
FakeQuantizeRangeMaxAbsOpLite() {}
explicit FakeQuantizeRangeMaxAbsOpLite(const std::string &type)
: OpLite(type) {}
bool CheckShape() const override { return true; }
bool InferShape() const override { return true; }
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override {
auto x = op_desc.Input("X").front();
auto in_scale = op_desc.Input("InScale").front();
auto out = op_desc.Output("Out").front();
auto out_scale = op_desc.Output("OutScale").front();
param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
param_.in_scale = scope->FindVar(in_scale)->GetMutable<lite::Tensor>();
param_.out = scope->FindVar(out)->GetMutable<lite::Tensor>();
param_.out_scale = scope->FindVar(out_scale)->GetMutable<lite::Tensor>();
param_.bit_length = op_desc.GetAttr<int>("bit_length");
return true;
}
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override {
return "fake_quantize_range_max_abs";
}
private:
mutable FakeQuantizeMovingAvgMaxAbsParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/flatten_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool FlattenOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.output);
return true;
}
bool FlattenOp::InferShape() const {
auto x_dims = param_.x->dims();
auto out_lod = param_.output->mutable_lod();
*out_lod = param_.x->lod();
int64_t outer = 1, inner = 1;
for (int i = 0; i < x_dims.size(); ++i) {
if (i < axis_) {
outer *= x_dims[i];
} else {
inner *= x_dims[i];
}
}
std::vector<int64_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
param_.output->Resize(out_shape);
return true;
}
bool FlattenOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
auto x_var = scope->FindVar(opdesc.Input("X").front());
auto output_var = scope->FindVar(opdesc.Output("Out").front());
CHECK(x_var);
CHECK(output_var);
param_.x = const_cast<lite::Tensor *>(&(x_var->Get<lite::Tensor>()));
param_.output = output_var->GetMutable<lite::Tensor>();
axis_ = opdesc.GetAttr<int>("axis");
param_.inplace = false;
CHECK(param_.x) << "Input(X) of FlattenOp should not be null.";
CHECK(param_.output) << "Output(Out) of FlattenOp should not be null.";
CHECK_GE(axis_, 0) << "Flatten op axis should be >= 0.";
return true;
}
bool Flatten2Op::CheckShape() const {
FlattenOp::CheckShape();
CHECK_OR_FALSE(param_.xshape);
return true;
}
bool Flatten2Op::InferShape() const {
FlattenOp::InferShape();
auto x_dims = param_.x->dims();
std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0);
for (size_t i = 0; i < x_dims.size(); i++) {
xshape_dims[i + 1] = x_dims[i];
}
param_.xshape->Resize(DDim(xshape_dims));
return true;
}
bool Flatten2Op::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
FlattenOp::AttachImpl(opdesc, scope);
auto xshape_var = scope->FindVar(opdesc.Output("XShape").front());
CHECK(xshape_var);
param_.xshape = xshape_var->GetMutable<lite::Tensor>();
CHECK(param_.xshape) << "Output(XShape) of FlattenOp should not be null.";
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(flatten, paddle::lite::operators::FlattenOp);
REGISTER_LITE_OP(flatten2, paddle::lite::operators::Flatten2Op);
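// Worked example of the shape rule implemented above, assuming axis_ = 2 and
// input dims [2, 3, 4, 5]: outer = 2 * 3 = 6, inner = 4 * 5 = 20, so Out is
// resized to [6, 20]; Flatten2 additionally records XShape = [0, 2, 3, 4, 5].
// A standalone sketch of the same computation, independent of lite types:
#include <cstdint>
#include <vector>
inline std::vector<int64_t> FlattenShape(const std::vector<int64_t> &dims,
                                         int axis) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < axis ? outer : inner) *= dims[i];  // dims before axis go to outer
  }
  return {outer, inner};
}
// FlattenShape({2, 3, 4, 5}, 2) returns {6, 20}.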
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class FlattenOp : public OpLite {
public:
FlattenOp() {}
explicit FlattenOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "flatten"; }
protected:
mutable ReshapeParam param_;
int axis_;
};
class Flatten2Op : public FlattenOp {
public:
Flatten2Op() : FlattenOp() {}
explicit Flatten2Op(const std::string &op_type) : FlattenOp(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "flatten2"; }
};
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -521,7 +521,7 @@ struct PriorBoxParam {
struct DensityPriorBoxParam : public PriorBoxParam {
std::vector<float> fixed_sizes;
std::vector<float> fixed_ratios;
std::vector<float> density_sizes;
std::vector<int> density_sizes;
};
/// ----------------------- GRU operators -----------------------
struct GRUParam {
......
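// Cross-reference: this float -> int change for density_sizes is what drives
// the edits elsewhere in this diff -- DensityPriorBoxOpLite::AttachImpl now
// reads the attribute as std::vector<int> (under either the "density_sizes"
// or "densities" spelling), and the ARM kernel switched to
// `auto density_size = param.density_sizes;` so it follows the new type.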
......@@ -40,12 +40,14 @@ bool PriorBoxOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
  param_.boxes = scope->FindVar(boxes)->GetMutable<lite::Tensor>();
  param_.variances = scope->FindVar(variances)->GetMutable<lite::Tensor>();
  param_.flip = opdesc.GetAttr<bool>("flip");
  param_.clip = opdesc.GetAttr<bool>("clip");
  param_.min_sizes = opdesc.GetAttr<std::vector<float>>("min_sizes");
  param_.max_sizes = opdesc.GetAttr<std::vector<float>>("max_sizes");
  param_.aspect_ratios = opdesc.GetAttr<std::vector<float>>("aspect_ratios");
  param_.variances_ = opdesc.GetAttr<std::vector<float>>("variances");
  if (opdesc.HasAttr("flip")) {
    param_.flip = opdesc.GetAttr<bool>("flip");
  }
  if (opdesc.HasAttr("img_w")) {
    param_.img_w = opdesc.GetAttr<int>("img_w");
  }
......
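The HasAttr guards added in this hunk make optional attributes safe to read: when a model's op descriptor omits an attribute, the parameter keeps its default instead of the lookup failing. A self-contained sketch of the pattern, with a simplified map-backed descriptor standing in for cpp::OpDesc:

#include <iostream>
#include <map>
#include <string>

// Simplified stand-in for an op descriptor's attribute store.
struct FakeOpDesc {
  std::map<std::string, int> int_attrs;
  bool HasAttr(const std::string &name) const {
    return int_attrs.count(name) > 0;
  }
  int GetAttr(const std::string &name) const { return int_attrs.at(name); }
};

int main() {
  FakeOpDesc desc;  // "img_w" deliberately left unset
  int img_w = 0;    // the default survives when the attribute is missing
  if (desc.HasAttr("img_w")) {
    img_w = desc.GetAttr("img_w");
  }
  std::cout << "img_w = " << img_w << std::endl;  // prints "img_w = 0"
  return 0;
}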
......@@ -21,7 +21,13 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
#lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if(LITE_BUILD_EXTRA)
lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
lite_cc_test(test_sgemm SRCS test_sgemm.cc DEPS ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......@@ -31,9 +37,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
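The tests gathered under the LITE_BUILD_EXTRA guard above are compiled only when the extra operator and kernel set is enabled, which keeps the default test build small; passing -DLITE_BUILD_EXTRA=ON to cmake restores them.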
......@@ -171,9 +171,9 @@ void test_fc(Place place) {
DDim bdim{{bflag ? n : 0}};
std::unique_ptr<arena::TestCase> tester(
new FcOPTest(place, "def", dim_in, wdim, bdim, 1));
#ifdef WITH_ARM_LITE
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 1);
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 6e-5);
if (!arena.TestPrecision()) {
......
......@@ -344,7 +344,7 @@ void test_gru_unit(Place place) {
place, "def", 1 /* sigomoid */, 2 /* tanh */, false, dims));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 1);
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
......
......@@ -75,7 +75,7 @@ void prior_box_compute_ref(const lite::Tensor* input,
const std::vector<float>& min_size_,
const std::vector<float>& fixed_size_,
const std::vector<float>& fixed_ratio_,
const std::vector<float>& density_size_,
const std::vector<int>& density_size_,
const std::vector<float>& max_size_,
const std::vector<float>& aspect_ratio_,
const std::vector<float>& variance_,
......@@ -352,7 +352,7 @@ class DensityPriorBoxComputeTester : public arena::TestCase {
std::vector<float> min_size_;
std::vector<float> fixed_size_;
std::vector<float> fixed_ratio_;
std::vector<float> density_size_;
std::vector<int> density_size_;
std::vector<float> max_size_;
std::vector<float> aspect_ratio_;
std::vector<float> variance_;
......@@ -375,7 +375,7 @@ class DensityPriorBoxComputeTester : public arena::TestCase {
const std::vector<float>& min_size,
const std::vector<float>& fixed_size,
const std::vector<float>& fixed_ratio,
const std::vector<float>& density_size,
const std::vector<int>& density_size,
const std::vector<float>& max_size,
const std::vector<float>& aspect_ratio,
const std::vector<float>& variance,
......@@ -561,7 +561,7 @@ class PriorBoxComputeTester : public arena::TestCase {
min_size_,
std::vector<float>(),
std::vector<float>(),
std::vector<float>(),
std::vector<int>(),
max_size_,
aspect_ratio_,
variance_,
......@@ -621,7 +621,7 @@ void test_density_prior_box(Place place) {
std::vector<float> variance{0.1f, 0.1f, 0.2f, 0.2f};
std::vector<float> fixed_size{60, 30};
std::vector<float> fixed_ratio{1., 2.};
std::vector<float> density_size{1., 3.};
std::vector<int> density_size{1, 3};
bool flip = true;
bool clip = false;
float step_h = 0;
......
......@@ -5,18 +5,22 @@ if [ $# -lt 2 ];
then
echo "Input error"
echo "USAGE:"
echo " sh benchmark.sh benchmark_bin_path test_models_dir"
echo " sh benchmark.sh benchmark_bin_path test_models_dir arm_bi"
echo " sh benchmark.sh benchmark_bin_path benchmark_models_path"
echo " sh benchmark.sh benchmark_bin_path benchmark_models_path is_run_model_optimize"
exit
fi
BENCHMARK_BIN=$1
MODELS_DIR=$2
ARM_BI=$3
ANDROID_DIR=/data/local/tmp
RESULT_FILENAME="result.txt"
WARMUP=10
REPEATS=30
BENCHMARK_BIN=$1
MODELS_DIR=$2
IS_RUN_MODEL_OPTIMIZE=false
if [ $# -gt 2 ];
then
IS_RUN_MODEL_OPTIMIZE=$3
fi
adb push $BENCHMARK_BIN $ANDROID_DIR/benchmark_bin
adb shell chmod 777 $ANDROID_DIR/benchmark_bin
......@@ -25,11 +29,11 @@ adb push $MODELS_DIR $ANDROID_DIR
adb shell "echo PaddleLite Benchmark > $ANDROID_DIR/$RESULT_FILENAME"
for threads in 1 2 4
do
adb shell "echo ABI=$ARM_BI Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
adb shell "echo Threads=$threads Warmup=$WARMUP Repeats=$REPEATS >> $ANDROID_DIR/$RESULT_FILENAME"
for model_name in `ls $MODELS_DIR`
do
echo $model_name
adb shell "$ANDROID_DIR/benchmark_bin --model_dir=$ANDROID_DIR/${MODELS_DIR##*/}/$model_name --warmup=$WARMUP --repeats=$REPEATS --threads=$threads --result_filename=$ANDROID_DIR/$RESULT_FILENAME"
adb shell "$ANDROID_DIR/benchmark_bin --model_dir=$ANDROID_DIR/${MODELS_DIR##*/}/$model_name --warmup=$WARMUP --repeats=$REPEATS --threads=$threads --result_filename=$ANDROID_DIR/$RESULT_FILENAME --run_model_optimize=$IS_RUN_MODEL_OPTIMIZE"
done
adb shell "echo >> $ANDROID_DIR/$RESULT_FILENAME"
done
......
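With this change the script's third argument toggles model optimization instead of recording the ABI, and it defaults to false when omitted; a full run would look like: sh benchmark.sh ./benchmark_bin ./benchmark_models true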
#!/bin/bash
set -ex
readonly CMAKE_COMMON_OPTIONS="-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
......@@ -31,6 +32,10 @@ function make_tiny_publish_so {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......@@ -55,6 +60,10 @@ function make_full_publish_so {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......@@ -78,6 +87,10 @@ function make_all_tests {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
if [ -d $build_dir ]
then
rm -rf $build_dir
fi
mkdir -p $build_dir
cd $build_dir
......
#!/bin/bash
os=armlinux
abi=armv8
lang=gcc
if [ x$1 != x ]; then
abi=$1
fi
if [ x$2 != x ]; then
lang=$2
fi
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
mkdir -p $build_dir
cd $build_dir
GEN_CODE_PATH_PREFIX=lite/gen_code
mkdir -p ./${GEN_CODE_PATH_PREFIX}
touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc
cmake .. \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_LITE=ON \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DWITH_ARM_DOTPROD=ON \
-DLITE_WITH_OPENMP=ON \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make -j4 publish_inference
cd -
#!/bin/bash
set -e
build_dir=build.ios.armv7.arm64
mkdir -p ${build_dir}
......@@ -15,11 +16,15 @@ cmake .. \
-DLITE_WITH_CUDA=OFF \
-DLITE_WITH_X86=OFF \
-DLITE_WITH_ARM=ON \
-DLITE_WITH_OPENMP=ON \
-DWITH_TESTING=OFF \
-DLITE_WITH_JAVA=OFF \
-DLITE_SHUTDOWN_LOG=ON \
-DLITE_ON_TINY_PUBLISH=ON \
-DLITE_WITH_OPENMP=OFF \
-DWITH_ARM_DOTPROD=OFF \
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
-DWITH_TESTING=ON \
-DARM_TARGET_OS=ios
make -j2
make -j4
cd -
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import logging

ops_list_path = sys.argv[1]
dest_path = sys.argv[2]

out_lines = [
    '#pragma once',
    '#include "paddle_lite_factory_helper.h"',
    '',
]

with open(ops_list_path) as f:
    for line in f:
        path = line.strip()
        status = ''
        with open(path) as g:
            lines = [v for v in g]
            for i in range(len(lines)):
                line = lines[i].strip()
                if not status:
                    key = 'REGISTER_LITE_KERNEL'
                    if line.startswith(key):
                        forward = i + min(7, len(lines) - i)
                        remaining = line[len(key) + 1:] + ' '.join(
                            [v.strip() for v in lines[i + 1:forward]])
                        x = remaining.find('.')
                        if x > 0:
                            remaining = remaining[:x]
                        fs = [v.strip() for v in remaining.split(',')]
                        # the registration macro carries six leading fields
                        assert (len(fs) >= 6)
                        op, target, precision, layout, __, alias = fs[:6]
                        alias = alias.replace(')', '')
                        key = "USE_LITE_KERNEL(%s, %s, %s, %s, %s);" % (
                            op, target, precision, layout, alias)
                        out_lines.append(key)

with open(dest_path, 'w') as f:
    logging.info("write kernel list to %s" % dest_path)
    f.write('\n'.join(out_lines))
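To illustrate what the collector emits (the kernel class name here is hypothetical), a registration such as

REGISTER_LITE_KERNEL(flatten, kARM, kFloat, kNCHW, ReshapeCompute, def)

is condensed in the generated header to

USE_LITE_KERNEL(flatten, kARM, kFloat, kNCHW, def);

that is, the script keeps the op, target, precision, layout, and alias fields and drops the kernel class along with everything after the first '.'.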
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
''' Collect op registry information. '''
import sys
import logging

ops_list_path = sys.argv[1]
dest_path = sys.argv[2]

out_lines = [
    '#pragma once',
    '#include "paddle_lite_factory_helper.h"',
    '',
]

with open(ops_list_path) as f:
    for line in f:
        path = line.strip()
        with open(path) as g:
            for line in g:
                key = 'REGISTER_LITE_OP'
                if line.startswith(key):
                    end = line.find(',')
                    op = line[len(key) + 1:end]
                    if not op:
                        continue
                    if "_grad" in op:
                        continue
                    out = "USE_LITE_OP(%s);" % op
                    out_lines.append(out)

with open(dest_path, 'w') as f:
    logging.info("write op list to %s" % dest_path)
    f.write('\n'.join(out_lines))
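Given the ops registered earlier in this change, the header this script writes would contain, for example:

#pragma once
#include "paddle_lite_factory_helper.h"

USE_LITE_OP(flatten);
USE_LITE_OP(flatten2);

Gradient ops are skipped because the lite predictor only runs inference.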
......@@ -115,7 +115,7 @@ void FillTensorData(lite::Tensor* tensor, const DebugConfig& conf, int col) {
data[i] = input_data[i];
}
} else {
LOG(INFO) << "------------> Use all-ones input";
LOG(INFO) << "-------------> Use all-ones input";
for (int i = 0; i < dim_size; i++) {
data[i] = 1;
}
......
......@@ -33,7 +33,7 @@ void Run(DebugConfig* conf) {
CHECK(conf);
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, conf->arm_thread_num);
DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_HIGH, conf->arm_thread_num);
#endif
lite::Predictor predictor;
std::vector<Place> valid_places({
......
......@@ -35,7 +35,7 @@ static bool IsFileExists(const std::string& path) {
// ARM mobile not support mkdir in C++
static void MkDirRecur(const std::string& path) {
#ifndef LITE_WITH_ARM
if(system(string_format("mkdir -p %s", path.c_str()).c_str()) != 0) {
if (system(string_format("mkdir -p %s", path.c_str()).c_str()) != 0) {
LOG(ERROR) << "Can't mkdir " << path;
}
#else // On ARM
......
......@@ -24,6 +24,7 @@
#include <cstdlib>
#include <cstring>
#include <string>
#include <assert.h>
#include "lite/utils/replace_stl/stream.h"
// NOLINTFILE()
......
---
Language: Cpp
BasedOnStyle: Google
Standard: Cpp11
...
Checks: >
  *,
  -android-*,
  -bugprone-bool-pointer-implicit-conversion,
  -cert-env33-c,
  -cert-dcl50-cpp,
  -cert-dcl59-cpp,
  -cppcoreguidelines-*,
  -fuchsia-*,
  -google-*,
  google-default-arguments,
  google-explicit-constructor,
  google-runtime-member-string-references,
  google-runtime-operator,
  -hicpp-braces-around-statements,
  -hicpp-named-parameter,
  -hicpp-no-array-decay,
  -hicpp-no-assembler,
  -hicpp-no-malloc,
  -hicpp-function-size,
  -hicpp-special-member-functions,
  -hicpp-vararg,
  -llvm-*,
  -objc-*,
  -readability-else-after-return,
  -readability-implicit-bool-conversion,
  -readability-named-parameter,
  -readability-simplify-boolean-expr,
  -readability-braces-around-statements,
  -readability-identifier-naming,
  -readability-function-size,
  -readability-redundant-member-init,
  -misc-bool-pointer-implicit-conversion,
  -misc-definitions-in-headers,
  -misc-unused-alias-decls,
  -misc-unused-parameters,
  -misc-unused-using-decls,
  -modernize-use-using,
  -modernize-use-default-member-init,
  -clang-diagnostic-*,
  -clang-analyzer-*
WarningsAsErrors: '*'
HeaderFilterRegex: ''
AnalyzeTemporaryDtors: false
FormatStyle: none
User: allonli
CheckOptions:
  - key: google-readability-braces-around-statements.ShortStatementLines
    value: '1'
  - key: google-readability-function-size.StatementThreshold
    value: '800'
  - key: google-readability-namespace-comments.ShortNamespaceLines
    value: '10'
  - key: google-readability-namespace-comments.SpacesBeforeComments
    value: '2'
  - key: modernize-loop-convert.MaxCopySize
    value: '16'
  - key: modernize-loop-convert.MinConfidence
    value: reasonable
  - key: modernize-loop-convert.NamingStyle
    value: CamelCase
  - key: modernize-pass-by-value.IncludeStyle
    value: llvm
  - key: modernize-replace-auto-ptr.IncludeStyle
    value: llvm
  - key: modernize-use-nullptr.NullMacros
    value: 'NULL'
opencl_kernels.cpp
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.lib
*.a
# Executables
*.exe
*.out
*.app
.DS_Store
build/
.idea/
CMakeCache.txt
CMakeFiles/
Makefile
cmake_install.cmake
*.cbp
paddle-mobile.cbp
.idea
compile_commands.json
cmake-build-debug/
cmake-build-release/
test/models/
test/images/
# Emacs intermediate files
*~
# CMake building directory
build
# clion building directories
cmake-build-debug
cmake-build-release
# ios
tools/libomp.a
# ios demo
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
/tools/quantification/quantify
# metal
Podfile.lock
metal/Pods/
SwiftProtobuf.framework
paddle-mobile.xcworkspace
metal/models/
metal/images/
*.a
metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a
*.xcuserdatad/
*/xcuserdata/
/venv/
metal/paddle-mobile-demo/paddle-mobile-demo/images
metal/paddle-mobile-demo/paddle-mobile-demo/models
metal/paddle-mobile-demo/paddle-mobile-demo/Resources
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images
metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models
metal/MobileNetDemo/MobileNetDemo/Resources
repos:
  - repo: https://github.com/Lucas-C/pre-commit-hooks.git
    sha: v1.0.1
    hooks:
      - id: remove-crlf
        files: ^(mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
        exclude: ^(lite/)
      - id: remove-tabs
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|cu|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: https://github.com/pre-commit/pre-commit-hooks
    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
    hooks:
      - id: check-added-large-files
        exclude: ^(lite/)
      - id: check-merge-conflict
        exclude: ^(lite/)
      - id: check-symlinks
        exclude: ^(lite/)
      - id: detect-private-key
        files: (?!.*tar.gz)^.*$
        exclude: ^(lite/)
      - id: end-of-file-fixer
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
      - id: trailing-whitespace
        files: ^(mobile/test/|mobile/src/).*\.(md|py|mm|swift|java|c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: local
    hooks:
      - id: copyright
        name: copyright
        entry: python ./mobile/tools/pre-commit.hooks/copyright.hook
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx|py)$
        exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|^(lite/)
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat.
        entry: bash ./mobile/tools/pre-commit.hooks/clang-format.hook -i
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: ^(lite/)
  - repo: local
    hooks:
      - id: cpplint
        name: cpplint
        description: Check C++ code style using cpplint.
        entry: bash ./mobile/tools/pre-commit.hooks/cpplint.hook
        language: system
        files: ^(mobile/test/|mobile/src/).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
        exclude: (?!.*third_party)^.*$|(?!.*book)^.*$|.*\.pb\.cpp|^(lite/)
#
#- repo: local
#  hooks:
#    - id: clang-tidy
#      name: clang-tidy
#      description: Check C++ code style using clang-tidy.
#      entry: bash ./tools/pre-commit.hooks/.clang-tidy.hook -i
#      language: system
#      files: (src).*\.(c|cc|cxx|cpp|h|hpp|hxx)$
language: cpp
cache: ccache
sudo: required
dist: trusty
os:
  - linux
addons:
  apt:
    packages:
      - git
      - python
      - python-pip
      - python2.7-dev
      - libc6-i386
      - curl
compiler:
  - clang
before_install:
  - sudo pip install -U virtualenv pre-commit pip
# Download and install recent cmake
script:
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
  - |
    timeout 600 .travis/pre-commit-job.sh # 10min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
notifications:
  email:
    on_success: change
    on_failure: always
#!/bin/bash
function abort(){
    echo "Your change doesn't follow Paddle-Mobile's code style" 1>&2
    echo "Please use pre-commit to auto-format your code." 1>&2
    exit 1
}
trap 'abort' 0
set -e
cd `dirname $0`
cd ..
export PATH=/usr/bin:$PATH
pre-commit install
if ! pre-commit run -a ; then
    ls -lh
    git diff --exit-code
    exit 1
fi
trap : 0
......@@ -96,6 +96,21 @@ class CLEngine {
    return std::move(program_ptr);
  }

  std::unique_ptr<_cl_program, CLProgramDeleter> CreateProgramWithSource(
      cl_context context, const char *source) {
    size_t sourceSize[] = {strlen(source)};
    cl_program p =
        clCreateProgramWithSource(context, 1, &source, sourceSize, &status_);
    DLOG << " cl kernel from source";
    DLOG << " source size: " << sourceSize[0];
    CL_CHECK_ERRORS(status_);
    std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p);
    return std::move(program_ptr);
  }

  std::unique_ptr<_cl_event, CLEventDeleter> CreateEvent(cl_context context) {
    cl_event event = clCreateUserEvent(context, &status_);
    std::unique_ptr<_cl_event, CLEventDeleter> event_ptr(event);
......
......@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "CL/cl.h"
#include "framework/cl/cl_deleter.h"
......@@ -24,6 +26,10 @@ limitations under the License. */
#include "framework/cl/cl_tool.h"
namespace paddle_mobile {
extern const std::map<std::string, std::vector<unsigned char>> opencl_kernels;
extern const std::vector<std::string> need_conv_header_kernels;
namespace framework {
class CLScope {
......@@ -62,15 +68,35 @@ class CLScope {
      return it->second.get();
    }

    auto program = CLEngine::Instance()->CreateProgramWith(
        context_,
        CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
    DLOG << " --- begin build program -> " << program_key << " --- ";
    CLEngine::Instance()->BuildProgram(program.get(), options);
    DLOG << " --- end build program -> " << program_key << " --- ";
    programs_[program_key] = std::move(program);

    if (opencl_kernels.find(file_name) != opencl_kernels.end()) {
      auto it = opencl_kernels.find(file_name);
      std::string source(it->second.begin(), it->second.end());
      if (std::find(need_conv_header_kernels.begin(),
                    need_conv_header_kernels.end(),
                    file_name) != need_conv_header_kernels.end()) {
        auto it = opencl_kernels.find("conv_kernel.inc.cl");
        std::string header(it->second.begin(), it->second.end());
        source = header + source;
      }
      auto program = CLEngine::Instance()->CreateProgramWithSource(
          context_, source.c_str());
      DLOG << " --- begin build program -> " << program_key << " --- ";
      CLEngine::Instance()->BuildProgram(program.get(), options);
      DLOG << " --- end build program -> " << program_key << " --- ";
      programs_[program_key] = std::move(program);
    } else {
      auto program = CLEngine::Instance()->CreateProgramWith(
          context_,
          CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name);
      DLOG << " --- begin build program -> " << program_key << " --- ";
      CLEngine::Instance()->BuildProgram(program.get(), options);
      DLOG << " --- end build program -> " << program_key << " --- ";
      programs_[program_key] = std::move(program);
    }

    return programs_[program_key].get();
  }
......
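The lookup order added above is: use the kernel source embedded in the binary when the code-generation script (shown later in this change) has packed it into opencl_kernels, prepend the conv_kernel.inc.cl header for kernels that require it, and only otherwise read the .cl file from disk. A self-contained sketch of that lookup with simplified types (the real code goes through CLEngine):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Prefer a kernel embedded in the binary; fall back to reading the .cl file
// from disk (stubbed out here).
std::string LoadKernelSource(
    const std::map<std::string, std::vector<unsigned char>> &embedded,
    const std::string &file_name) {
  auto it = embedded.find(file_name);
  if (it != embedded.end()) {
    return std::string(it->second.begin(), it->second.end());
  }
  return "/* would read cl_kernel/" + file_name + " from disk */";
}

int main() {
  std::map<std::string, std::vector<unsigned char>> embedded = {
      {"relu.cl", {'_', '_', 'k', 'e', 'r', 'n', 'e', 'l'}}};
  std::cout << LoadKernelSource(embedded, "relu.cl") << std::endl;
  return 0;
}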
......@@ -16,9 +16,9 @@ limitations under the License. */
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <memory>
#include <utility>
#include <vector>
......
......@@ -14,10 +14,10 @@ limitations under the License. */
#ifdef CONDITIONAL_BLOCK_OP
#include <algorithm>
#include "operators/kernel/conditional_block_kernel.h"
#include <framework/program/block_desc.h>
#include <framework/program/op_desc.h>
#include <algorithm>
#include "framework/data_type.h"
namespace paddle_mobile {
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import os
import sys

source = """
#pragma
#ifdef PADDLE_MOBILE_CL
#include <map>
#include <string>
#include <vector>
namespace paddle_mobile {
extern const std::map<std::string, std::vector<unsigned char>> opencl_kernels = {
%s
};
extern const std::vector<std::string> need_conv_header_kernels = {
%s
};
}
#endif
"""


def string_to_hex(code_str):
    hex_list = []
    for i in range(len(code_str)):
        hex_ = hex(ord(code_str[i]))
        hex_list.append(hex_)
    return hex_list


infile = open("cl_kernel/cl_common.h", "r")
common_content = infile.read()
infile.close()
common_content = re.sub(r"/\*[^*]*\*/", "", common_content, flags=re.DOTALL)
lines = common_content.split("\n")
new_lines = []
for i in range(len(lines)):
    line = lines[i]
    line = line.strip()
    if line == "":
        continue
    if line.startswith("//"):
        continue
    line = re.sub(r"//.*$", "", line)
    new_lines.append(line)
common_content = "\n".join(new_lines)

need_conv_header_kernels = []
cores = ""
filenames = os.listdir("cl_kernel")
file_count = len(filenames)
for i in range(file_count):
    filename = filenames[i]
    infile = open("cl_kernel/" + filename, "r")
    new_lines = []
    content = infile.read()
    content = re.sub(r"/\*[^*]*\*/", "", content, flags=re.DOTALL)
    infile.close()
    lines = content.split("\n")
    # iterate with j so the file index i is not clobbered by the line loop
    for j in range(len(lines)):
        line = lines[j]
        line = line.strip()
        if line == "":
            continue
        if line.startswith("//"):
            continue
        line = re.sub(r"//.*$", "", line)
        if "cl_common.h" in line:
            line = common_content
        elif "conv_kernel.inc.cl" in line:
            need_conv_header_kernels.append("\"%s\"" % filename)
            continue
        new_lines.append(line)
    content = "\n".join(new_lines)
    if content == "":
        content = " "
    hexes = []
    for char in content:
        hexes.append(hex(ord(char)))
    core = " {\"%s\", {" % filename
    for item in hexes:
        core += str(item) + ", "
    core = core[:-2]
    core += "}}"
    if i != file_count - 1:
        core += ",\n"
    cores += core

source = source % (cores, ",".join(need_conv_header_kernels))
print(source)
This diff has been collapsed.
......@@ -2,6 +2,15 @@
NETS=""
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet" "mobilenetssd" "nlp" "mobilenetfssd" "genet" "super" "op")
# merge cl to so
merge_cl_to_so=1
rm -f ../src/operators/kernel/cl/opencl_kernels.cpp
if [ $merge_cl_to_so == 1 ]; then
    cd ../src/operators/kernel/cl
    python gen_code.py > opencl_kernels.cpp
    cd -
fi
build_for_mac() {
if [ ! `which brew` ]; then
echo "building failed! homebrew not found, please install homebrew."
......
#!/bin/bash
set -e
# set -e
readonly VERSION="5.0"
......
......@@ -535,6 +535,7 @@ def main():
push(checked_model_path)
push(feed_path + "/" + last_feed_file_name, "input.txt")
push(mobile_src_root + "/build/release/arm-v7a/build/libpaddle-mobile.so")
push(mobile_src_root + "/build/release/arm-v7a/build/cl_kernel")
push(mobile_src_root + "/test/build/test-net")
last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
args = str(len(last_feed_var_shape))
......