Commit 34491d6a authored by H hong19860320

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into hongming/refine_arm_context
@@ -42,7 +42,17 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(FATAL_ERROR "ARM_TARGET_ARCH_ABI must be in one of ${ARM_TARGET_ARCH_ABI_LIST}")
endif()
# check arch abi
if(NOT DEFINED ARM_TARGET_LANG)
set(ARM_TARGET_LANG "gcc" CACHE STRING "Choose ARM Target Language")
endif()
set(ARM_TARGET_LANG_LIST "gcc" "clang" "")
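# The empty "" entry lets an explicitly cleared ARM_TARGET_LANG still pass the check below.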
set_property(CACHE ARM_TARGET_LANG PROPERTY STRINGS ${ARM_TARGET_LANG_LIST})
if (NOT ARM_TARGET_LANG IN_LIST ARM_TARGET_LANG_LIST)
message(FATAL_ERROR "ARM_TARGET_LANG must be in one of ${ARM_TARGET_LANG_LIST}")
endif()
message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}")
include(cross_compiling/host)
include(cross_compiling/armlinux)
include(cross_compiling/android)
@@ -158,6 +168,9 @@ include_directories("${PADDLE_SOURCE_DIR}")
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(STATUS "Building the mobile framework")
if (ANDROID)
include(cross_compiling/findar)
endif()
# include the necessary thirdparty dependencies
include(external/gflags)  # download, build, install gflags
include(external/glog)  # download, build, install glog
...
@@ -31,7 +31,7 @@ if(NOT DEFINED ANDROID_API_LEVEL)
endif()
if(NOT DEFINED ANDROID_STL_TYPE)
set(ANDROID_STL_TYPE "c++_static" CACHE STRING "stl type")  # can also use shared
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf")
@@ -71,8 +71,31 @@ if (NOT ANDROID_STL_TYPE IN_LIST ANDROID_STL_TYPE_LITS)
message(FATAL_ERROR "ANDROID_STL_TYPE must be in one of ${ANDROID_STL_TYPE_LITS}")
endif()
if(ARM_TARGET_LANG STREQUAL "gcc")
# gcc is the NDK default toolchain, so no toolchain version suffix is needed
set(ARM_TARGET_LANG "")
endif()
set(CMAKE_SYSTEM_NAME Android)
set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL})
set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI})
set(CMAKE_ANDROID_NDK ${ANDROID_NDK})
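# An empty ARM_TARGET_LANG keeps the NDK default (gcc) toolchain; "clang" selects the clang toolchain.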
set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG})
set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE})
if (ARM_TARGET_LANG STREQUAL "clang")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(triple aarch64-v8a-linux-android)
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
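# CMake forwards the triple to clang via --target through CMAKE_<LANG>_COMPILER_TARGET.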
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_COMPILER_TARGET ${triple})
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_COMPILER_TARGET ${triple})
message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}")
endif()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT ARM_TARGET_LANG STREQUAL "clang")
# only clang needs to locate the ar tool
return()
endif()
if(NOT EXISTS "${CMAKE_CXX_COMPILER}")
message(FATAL_ERROR "Cannot find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}")
endif()
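# llvm-ar is expected to sit in the same directory as the clang++ driver.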
get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH)
find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH})
if(NOT AR_TOOL)
message(ERROR "Failed to find AR_TOOL in ${AR_PATH}")
else()
set(CMAKE_AR ${AR_TOOL})
message(STATUS "Found CMAKE_AR : " ${CMAKE_AR})
endif()
@@ -40,7 +40,8 @@ if(ANDROID)
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -46,7 +46,8 @@ if(ANDROID)
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -58,7 +58,9 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
"-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
endif()
ExternalProject_Add(
...
@@ -199,6 +199,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
"-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
"-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
"-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
"-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}"
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
...
@@ -5,7 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(analysis_passes SRCS use_passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
...
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace paddle {
namespace inference {
...
@@ -10,6 +10,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
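# LITE_ON_MOBILE is a short alias for LITE_WITH_LIGHT_WEIGHT_FRAMEWORK, used by the test rules below.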
set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK})
set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
@@ -182,3 +185,11 @@ add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
add_subdirectory(gen_code)
if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
endif()
endif()
set(cxx_api_lite_deps
scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite)
if(LITE_WITH_CUDA)
set(cxx_api_lite_deps ${cxx_api_lite_deps} kernels_cuda)
cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda)
nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda)
endif()
lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
DEPS scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite
${ops_lite} ${host_kernels}
CUDA_DEPS kernels_cuda
X86_DEPS ${x86_kernels}
)
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)
set(light_api_deps
scope_lite target_wrapper_host model_parser_lite program_lite)
if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
)
message(STATUS "get ops ${ops_lite}") message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get Host kernels ${host_kernels}")
...@@ -24,24 +33,41 @@ include(ExternalProject) ...@@ -24,24 +33,41 @@ include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.") "A path setting inference demo download directories.")
if((NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) AND WITH_TESTING) if(WITH_TESTING)
set(eval_model_dir "")
set(test_cxx_api_deps cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels})
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
set(eval_model_dir ${LITE_MODEL_DIR}/mobilenet_v2_relu)
set(test_cxx_api_deps ${test_cxx_api_deps} ${arm_kernels})
endif()
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS ${test_cxx_api_deps}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
--eval_model_dir=${eval_model_dir} SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_dependencies(test_cxx_api_lite extern_lite_download_mobilenet_v2_relu_tar_gz)
endif()
endif()
# These tests need CLI arguments and are not supported in the ARM CI.
# TODO(Superjomn) support them later.
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite mir_passes
X86_DEPS ${x86_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
X86_DEPS ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
DEPS
@@ -51,4 +77,3 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*
 * We test multiple APIs here.
 */
#include <gtest/gtest.h>
#include <sstream>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/light_api.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
void SetConstInput(lite::Tensor* x) {
x->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
auto* data = x->mutable_data<float>();
for (int i = 0; i < 100 * 100; i++) {
data[i] = i;
}
}
bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
const LightPredictor& light_api) {
const auto* a = cxx_api.GetTensor(name);
const auto* b = light_api.GetTensor(name);
return TensorCompareWith(*a, *b);
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi_LightApi, save_and_load_model) {
lite::ExecutorLite cxx_api;
lite::LightPredictor light_api;
// CXXApi
{
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
cxx_api.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
valid_places);
auto* x = cxx_api.GetInput(0);
SetConstInput(x);
cxx_api.Run();
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
cxx_api.SaveModel(FLAGS_optimized_model);
}
// LightApi
{
light_api.Build(FLAGS_optimized_model);
auto* x = light_api.GetInput(0);
SetConstInput(x);
light_api.Run();
}
const auto* cxx_out = cxx_api.GetOutput(0);
const auto* light_out = light_api.GetOutput(0);
ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out));
std::vector<std::string> tensors_with_order({
"a", "fc_0.w_0", "scale_0.tmp_0",
});
for (const auto& tensor_name : tensors_with_order) {
ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
}
}
#endif // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
} // namespace lite
} // namespace paddle
@@ -78,6 +78,11 @@ class ExecutorLite {
return &fetch_list.at(offset);
}
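// Look up an internal tensor by variable name; handy for tests and debugging.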
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
void Run() { program_->Run(); }
const framework::proto::ProgramDesc& program_desc() const {
...
@@ -14,8 +14,9 @@
#include "paddle/fluid/lite/api/cxx_api.h"
#include <chrono>  // NOLINT
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
...
@@ -16,59 +16,34 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/lite_api_test_helper.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
// For training.
DEFINE_string(startup_program_path, "", "");
DEFINE_string(main_program_path, "", "");
// For eval.
DEFINE_string(eval_model_dir, "", "");
namespace paddle {
namespace lite {
TEST(CXXApi, test) {
const lite::Tensor* out = RunHvyModel();
LOG(INFO) << out << " memory size " << out->data_size();
for (int i = 0; i < 10; i++) {
LOG(INFO) << "out " << out->data<float>()[i];
}
LOG(INFO) << "dims " << out->dims();
// LOG(INFO) << "out " << *out;
}
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi, save_model) {
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -79,9 +54,7 @@ TEST(CXXApi, save_model) {
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
predictor.SaveModel(FLAGS_optimized_model);
}
/*TEST(CXXTrainer, train) {
Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)});
std::vector<Place> valid_places({prefer_place});
@@ -115,46 +88,37 @@ TEST(CXXApi, save_model) {
}*/
#endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#ifdef LITE_WITH_ARM
TEST(CXXApi, eval) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(FLAGS_eval_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < input_tensor->dims().production(); i++) {
data[i] = 1;
}
predictor.Run();
auto* out = predictor.GetOutput(0);
std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
0.00098268, 0.00104065, 0.00099962, 0.00095181,
0.00099694, 0.00099406});
for (int i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
}
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
}
#endif
}  // namespace lite
}  // namespace paddle
@@ -22,6 +22,7 @@
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/program.h"
#include "paddle/fluid/lite/core/types.h"
@@ -62,6 +63,11 @@ class LightPredictor {
return &fetch_list.at(offset);
}
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
private:
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
@@ -72,9 +78,8 @@ class LightPredictor {
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
@@ -89,8 +94,8 @@ class LightPredictor {
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
private:
...
@@ -15,6 +15,9 @@
#include "paddle/fluid/lite/api/light_api.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"
DEFINE_string(optimized_model, "", "");
@@ -33,29 +36,14 @@ TEST(LightAPI, load) {
}
predictor.Run();
const auto* output = predictor.GetOutput(0);
const float* raw_output = output->data<float>();
for (int i = 0; i < 10; i++) {
LOG(INFO) << "out " << raw_output[i];
}
}
}  // namespace lite
}  // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/api/lite_api_test_helper.h"
#include <vector>
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel() {
lite::ExecutorLite predictor;
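// Without CUDA, restrict the valid places to host and x86 kernels.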
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
#else
std::vector<Place> valid_places({
Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)},
Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)},
Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)},
});
#endif
predictor.Build(FLAGS_model_dir,
Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda
valid_places);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({100, 100})));
auto* data = input_tensor->mutable_data<float>();
for (int i = 0; i < 100 * 100; i++) {
data[i] = i;
}
// LOG(INFO) << "input " << *input_tensor;
predictor.Run();
const auto* out = predictor.GetOutput(0);
return out;
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/op_registry.h"
DECLARE_string(model_dir);
DECLARE_string(optimized_model);
namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel();
} // namespace lite
} // namespace paddle
@@ -65,7 +65,59 @@ void elementwise_add<float>(const float* dinx, const float* diny, float* dout,
}
template <>
void elementwise_add_relu<float>(const float* dinx, const float* diny,
float* dout, int num) {
int cnt = num >> 4;
int remain = num % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
const float* dinx_ptr = dinx + (i << 4);
const float* diny_ptr = diny + (i << 4);
float* dout_ptr = dout + (i << 4);
float32x4_t dinx0 = vld1q_f32(dinx_ptr);
float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4);
float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8);
float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12);
float32x4_t diny0 = vld1q_f32(diny_ptr);
float32x4_t diny1 = vld1q_f32(diny_ptr + 4);
float32x4_t diny2 = vld1q_f32(diny_ptr + 8);
float32x4_t diny3 = vld1q_f32(diny_ptr + 12);
dinx0 = vaddq_f32(dinx0, diny0);
dinx1 = vaddq_f32(dinx1, diny1);
dinx2 = vaddq_f32(dinx2, diny2);
dinx3 = vaddq_f32(dinx3, diny3);
// relu
dinx0 = vmaxq_f32(dinx0, vzero);
dinx1 = vmaxq_f32(dinx1, vzero);
dinx2 = vmaxq_f32(dinx2, vzero);
dinx3 = vmaxq_f32(dinx3, vzero);
vst1q_f32(dout_ptr, dinx0);
vst1q_f32(dout_ptr + 4, dinx1);
vst1q_f32(dout_ptr + 8, dinx2);
vst1q_f32(dout_ptr + 12, dinx3);
}
if (remain > 0) {
const float* dinx_ptr = dinx + (cnt << 4);
const float* diny_ptr = diny + (cnt << 4);
float* dout_ptr = dout + (cnt << 4);
for (int i = 0; i < remain; i++) {
float tmp = *dinx_ptr + *diny_ptr;
*dout_ptr = tmp > 0.f ? tmp : 0.f;
dout_ptr++;
dinx_ptr++;
diny_ptr++;
}
}
}
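For reference, the kernel above fuses the addition with the activation, computing out[i] = max(x[i] + y[i], 0) over 16 floats per iteration (four float32x4_t registers). A minimal scalar sketch of the same semantics (illustrative only; elementwise_add_relu_ref is a hypothetical name, not part of this diff):
// Scalar reference for the fused add + relu; the NEON kernel above
// falls back to exactly this form for the remainder elements.
template <typename T>
void elementwise_add_relu_ref(const T* dinx, const T* diny, T* dout, int num) {
  for (int i = 0; i < num; ++i) {
    T tmp = dinx[i] + diny[i];
    dout[i] = tmp > T(0) ? tmp : T(0);
  }
}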
template <>
void elementwise_add_broadcast<float>(const float* dinx, const float* diny,
float* dout, int batch, int channels,
int num) {
#pragma omp parallel for collapse(2)
@@ -127,6 +179,82 @@ void elementwise_add_axis<float>(const float* dinx, const float* diny,
}
}
template <>
void elementwise_add_relu_broadcast<float>(const float* dinx, const float* diny,
float* dout, int batch, int channels,
int num) {
float32x4_t vzero = vdupq_n_f32(0.f);
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const float* din_ptr = dinx + offset;
const float diny_data = diny[j];
float* dout_ptr = dout + offset;
int cnt = num >> 4;
int remain = num % 16;
float32x4_t rb = vdupq_n_f32(diny_data);
for (int k = 0; k < cnt; ++k) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
din0 = vaddq_f32(din0, rb);
din1 = vaddq_f32(din1, rb);
din2 = vaddq_f32(din2, rb);
din3 = vaddq_f32(din3, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
din1 = vmaxq_f32(din1, vzero);
din2 = vmaxq_f32(din2, vzero);
din3 = vmaxq_f32(din3, vzero);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
vst1q_f32(dout_ptr + 8, din2);
vst1q_f32(dout_ptr + 12, din3);
din_ptr += 16;
dout_ptr += 16;
}
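// Tail handling: consume 8-wide, then 4-wide, then scalar remainders.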
if (remain >= 8) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
din0 = vaddq_f32(din0, rb);
din1 = vaddq_f32(din1, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
din1 = vmaxq_f32(din1, vzero);
vst1q_f32(dout_ptr, din0);
vst1q_f32(dout_ptr + 4, din1);
din_ptr += 8;
dout_ptr += 8;
remain -= 8;
}
if (remain >= 4) {
float32x4_t din0 = vld1q_f32(din_ptr);
din0 = vaddq_f32(din0, rb);
// relu
din0 = vmaxq_f32(din0, vzero);
vst1q_f32(dout_ptr, din0);
din_ptr += 4;
dout_ptr += 4;
remain -= 4;
}
if (remain > 0) {
for (int p = 0; p < remain; p++) {
float tmp = *din_ptr + diny_data;
*dout_ptr = tmp > 0.f ? tmp : 0.f;
dout_ptr++;
din_ptr++;
}
}
}
}
}
}  // namespace math
}  // namespace arm
}  // namespace lite
...
@@ -23,9 +23,16 @@ template <typename T>
void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num);
template <typename T>
void elementwise_add_broadcast(const T* dinx, const T* diny, T* dout, int batch,
int channels, int num);
template <typename T>
void elementwise_add_relu_broadcast(const T* dinx, const T* diny, T* dout,
int batch, int channels, int num);
}  // namespace math
}  // namespace arm
}  // namespace lite
...
if (WITH_TESTING)
cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
endif()
lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc
DEPS target_wrapper_host
@@ -59,4 +59,3 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
@@ -86,6 +86,7 @@ class TensorHvy : public TensorBase<TensorHvy> {
template <typename T>
T* mutable_data() {
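// Record the allocation size so memory_size() (mirroring lite::TensorLite) can report it.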
memory_size_ = framework::product(data_.dims()) * sizeof(T);
return data_.mutable_data<T>(data_.dims(), platform::CPUPlace());
}
template <typename T>
@@ -128,8 +129,11 @@ class TensorHvy : public TensorBase<TensorHvy> {
const framework::LoDTensor& raw_tensor() const { return data_; }
framework::LoDTensor& raw_tensor() { return data_; }
size_t memory_size() const { return memory_size_; }
private:
framework::LoDTensor data_;
size_t memory_size_{};
};
}  // namespace lite
...
@@ -90,6 +90,8 @@ class TensorLite : public TensorBase<TensorLite> {
void *mutable_data(size_t memory_size);
void *mutable_data(TargetType target, size_t memory_size);
const void *raw_data() const { return buffer_->data(); }
size_t memory_size() const { return memory_size_; }
bool IsInitialized() const { return buffer_->data(); }
...
@@ -5,11 +5,15 @@ cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir
cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager)
add_subdirectory(fusion)
add_subdirectory(elimination)
cc_library(mir_passes
SRCS
fusion/fc_fuse_pass.cc
fusion/conv_elementwise_add_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_transform_pass.cc
@@ -73,7 +77,7 @@ message(STATUS "----> Ops lite: ${ops_lite}")
message(STATUS "----> Host kernels: ${host_kernels}")
message(STATUS "----> X86 kernels: ${x86_kernels}")
lite_cc_test(test_lite_fc_fuse SRCS fusion/fc_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels} ${arm_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_fc_model
@@ -84,10 +88,10 @@ add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz)
lite_cc_test(test_lite_conv_elementwise_add_activation_fuse
SRCS fusion/conv_elementwise_add_activation_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels})
lite_cc_test(test_lite_elementwise_add_activation_fuse
SRCS fusion/elementwise_add_activation_fuse_pass_test.cc
DEPS cxx_api_lite mir_passes
${ops_lite} ${host_kernels} ${x86_kernels})
if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_cc_test(test_identity_scale_eliminate_pass_lite
SRCS identity_scale_eliminate_pass_test.cc
DEPS mir_passes program_lite proto_desc cpp_op_desc_lite
${ops_lite}
)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/pass.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace {
class Eliminator : public FuseBase {
public:
void BuildPattern() override {
auto* pre_op = OpNode("preop");  // the previous op whose output needs updating
// TODO(Superjomn) check has only one output
auto* x = VarNode("x")->assert_is_op_input("scale", "X");
auto* scale_op = OpNode("scale", "scale")
->assert_op_attr<float>("scale", 1.)
->assert_op_attr<float>("bias", 0.);
auto* out = VarNode("out")->assert_is_op_output("scale", "Out");
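// Pattern: pre_op -> x -> scale(scale=1, bias=0) -> out.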
*pre_op >> *x >> *scale_op >> *out;
// The scale op is removed and pre_op is reset so that its output feeds out directly.
x->AsIntermediate();  // x is pre_op's old output and will be removed
}
private:
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
auto& pre_op = matched.at("preop")->AsStmt();
auto op_info = *pre_op.op_info();
op_info.UpdateAllOutputs(matched.at("x")->AsArg().name,
matched.at("out")->AsArg().name);
pre_op.ResetOp(op_info, graph->valid_places());
GraphSafeRemoveNodes(graph, {matched.at("scale")});
IR_NODE_LINK_TO(matched.at("preop"), matched.at("out"));
}
};
} // namespace
class IdentityScaleEliminatePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override {
Eliminator eliminator;
eliminator(graph.get());
}
};
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(identity_scale_eliminate_pass,
paddle::lite::mir::IdentityScaleEliminatePass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
#include "paddle/fluid/lite/core/mir/ssa_graph.h"
namespace paddle {
namespace lite {
namespace mir {
std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
const std::shared_ptr<Scope>& scope,
const std::vector<Place>& valid_places) {
// Op list:
// (x)->feed -> (feed) -> scale -> (scale_out) -> fetch->(fetch)
// After pass
// (x)->feed->(scale_out)->fetch->(fetch)
auto* main_block = program_desc->MutableBlock(0);
auto* feed_op = main_block->AppendOp();
auto* scale_op = main_block->AppendOp();
auto* fetch_op = main_block->AppendOp();
main_block->Var("x");
main_block->Var("feed");
main_block->Var("scale_out");
main_block->Var("fetch_out");
scope->Var("x")->GetMutable<lite::Tensor>();
scope->Var("feed")->GetMutable<lite::Tensor>();
scope->Var("scale_out")->GetMutable<lite::Tensor>();
scope->Var("fetch_out")->GetMutable<lite::Tensor>();
feed_op->SetType("feed");
feed_op->SetInput("X", {"x"});
feed_op->SetAttr("col", 1);
feed_op->SetOutput("Out", {"feed"});
scale_op->SetType("scale");
scale_op->SetInput("X", {"feed"});
scale_op->SetOutput("Out", {"scale_out"});
scale_op->SetAttr("scale", 1.f);
scale_op->SetAttr("bias", 0.f);
scale_op->SetAttr("bias_after_scale", true);
fetch_op->SetType("fetch");
fetch_op->SetInput("X", {"scale_out"});
fetch_op->SetOutput("Out", {"fetch"});
fetch_op->SetAttr("col", 1);
program_desc->Flush();
lite::Program program(*program_desc->Proto(), scope, valid_places);
auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
graph->Build(program, valid_places);
LOG(INFO) << Visualize(graph.get());
return graph;
}
TEST(identity_test, test) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
const int num_nodes = graph->nodes().size();
auto pass = PassManager::Global().LookUp("identity_scale_eliminate_pass");
ASSERT_TRUE(pass);
pass->Apply(graph);
ASSERT_EQ(graph->nodes().size(), num_nodes - 2UL);
}
} // namespace mir
} // namespace lite
} // namespace paddle
USE_LITE_OP(feed)
USE_LITE_OP(fetch)
USE_LITE_OP(scale)
USE_MIR_PASS(identity_scale_eliminate_pass)
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.h"
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
...
@@ -70,7 +70,7 @@ void ConvBNFuser::BuildPattern() {
void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
auto conv = matched.at("conv2d")->stmt()->op();
auto* scope = conv->scope();
auto& valid_places = conv->valid_places();
@@ -84,7 +84,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
->GetMutable<lite::Tensor>();
size_t bias_size = bn_scale_t->data_size();
auto bn_scale_d = bn_scale_t->mutable_data<float>();
CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h"
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
@@ -20,7 +20,7 @@
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/program.h"
...
@@ -65,7 +65,7 @@ void ConvElementwiseAddActivationFuser::InsertNewNode(
SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
auto conv_old = matched.at("conv2d")->stmt()->op();
auto* scope = conv_old->scope();
auto& valid_places = conv_old->valid_places();
conv_op->Attach(op_desc, scope);
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "conv_elementwise_add_relu_fuse_pass.h"
#include <memory>
#include <vector>
#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h"
#include "paddle/fluid/lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ConvElementwiseAddReLUFusePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
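// Run the fuser for both standard and depthwise convolutions.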
fusion::ConvElementwiseAddReLUFuser fuser("conv2d");
fuser(graph.get());
fusion::ConvElementwiseAddReLUFuser depthwise_fuser("depthwise_conv2d");
depthwise_fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_elementwise_add_act_fuse_pass,
paddle::lite::mir::ConvElementwiseAddReLUFusePass);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class ConvElementwiseAddReLUFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "conv_elementwise_add_relu_fuse_pass.h"
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
#include "paddle/fluid/lite/core/mir/passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/program.h"
DEFINE_string(model_dir, "", "");
DEFINE_string(optimized_model, "", "");
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
const std::shared_ptr<Scope>& scope,
const std::vector<Place>& valid_places) {
auto* main_block = program_desc->MutableBlock(0);
auto* conv2d_1 = main_block->AppendOp();
auto* conv2d_2 = main_block->AppendOp();
auto* add_1 = main_block->AppendOp();
auto* relu_1 = main_block->AppendOp();
auto* add_2 = main_block->AppendOp();
auto* relu_2 = main_block->AppendOp();
main_block->Var("input_1");
main_block->Var("input_2");
main_block->Var("filter_1");
main_block->Var("filter_2");
main_block->Var("conv2d_1_out");
main_block->Var("conv2d_2_out");
main_block->Var("bias_1");
main_block->Var("add_1_out");
main_block->Var("add_2_out");
main_block->Var("relu_1_out");
main_block->Var("out");
scope->Var("input_1")->GetMutable<lite::Tensor>();
scope->Var("input_2")->GetMutable<lite::Tensor>();
scope->Var("filter_1")->GetMutable<lite::Tensor>();
scope->Var("filter_2")->GetMutable<lite::Tensor>();
scope->Var("conv2d_1_out")->GetMutable<lite::Tensor>();
scope->Var("conv2d_2_out")->GetMutable<lite::Tensor>();
scope->Var("bias_1")->GetMutable<lite::Tensor>();
scope->Var("add_1_out")->GetMutable<lite::Tensor>();
scope->Var("add_2_out")->GetMutable<lite::Tensor>();
scope->Var("relu_1_out")->GetMutable<lite::Tensor>();
scope->Var("out")->GetMutable<lite::Tensor>();
conv2d_1->SetType("conv2d");
conv2d_1->SetInput("Input", {"input_1"});
conv2d_1->SetInput("Filter", {"filter_1"});
conv2d_1->SetOutput("Output", {"conv2d_1_out"});
conv2d_1->SetAttr("strides", std::vector<int>({1, 1}));
conv2d_1->SetAttr("paddings", std::vector<int>({0, 0}));
conv2d_1->SetAttr("groups", 1);
conv2d_1->SetAttr("dilations", std::vector<int>({1, 1}));
conv2d_1->SetAttr("fuse_relu", false);
add_1->SetType("elementwise_add");
add_1->SetInput("X", {"conv2d_1_out"});
add_1->SetInput("Y", {"bias_1"});
add_1->SetOutput("Out", {"add_1_out"});
add_1->SetAttr("axis", 1);
relu_1->SetType("relu");
relu_1->SetInput("X", {"add_1_out"});
relu_1->SetOutput("Out", {"relu_1_out"});
conv2d_2->SetType("conv2d");
conv2d_2->SetInput("Input", {"input_2"});
conv2d_2->SetInput("Filter", {"filter_2"});
conv2d_2->SetOutput("Output", {"conv2d_2_out"});
conv2d_2->SetAttr("strides", std::vector<int>({1, 1}));
conv2d_2->SetAttr("paddings", std::vector<int>({0, 0}));
conv2d_2->SetAttr("groups", 1);
conv2d_2->SetAttr("dilations", std::vector<int>({1, 1}));
conv2d_2->SetAttr("fuse_relu", false);
add_2->SetType("elementwise_add");
add_2->SetInput("X", {"conv2d_2_out"});
add_2->SetInput("Y", {"relu_1_out"});
add_2->SetOutput("Out", {"add_2_out"});
add_2->SetAttr("axis", 1);
relu_2->SetType("relu");
relu_2->SetInput("X", {"add_2_out"});
relu_2->SetOutput("Out", {"out"});
program_desc->Flush();
lite::Program program(*program_desc->Proto(), scope, valid_places);
auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
graph->Build(program, valid_places);
return graph;
}
TEST(conv_elementwise_add_relu_fuse_pass, graph_test) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
Visualize(graph.get());
ASSERT_EQ(graph->nodes().size(), 11UL /*vars*/ + 6UL /*ops*/);
Visualize(graph.get());
}
TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) {
framework::ProgramDesc program_desc;
std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
auto graph = BuildGraph(&program_desc, scope, places);
Visualize(graph.get());
  const size_t num_nodes = graph->nodes().size();
  ConvElementwiseAddReLUFusePass fuser;  // stack object, avoids the leak
  fuser.Apply(graph);
  Visualize(graph.get());
  // Each of the two fusions removes 5 nodes (3 ops + 2 intermediate vars)
  // and adds a single fused conv node.
  ASSERT_EQ(graph->nodes().size(), num_nodes - 5UL * 2 /* nodes removed */ +
                                       1UL * 2 /* fused conv nodes */);
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
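// Reference the ops used by the test graph so their static registrations are
// linked into the test binary.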
USE_LITE_OP(elementwise_add);
USE_LITE_OP(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(relu);
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h"
...
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/elementwise_add_activation_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h"
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
@@ -20,7 +20,7 @@
 #include "paddle/fluid/lite/api/cxx_api.h"
 #include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/core/program.h"
...
...
@@ -54,7 +54,7 @@ void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph,
   auto op_desc = GenOpDesc(matched);
   auto op =
       LiteOpRegistry::Global().Create("fusion_elementwise_add_activation");
-  auto old_op = matched.at("add")->stmt()->op;
+  auto old_op = matched.at("add")->stmt()->op();
   auto* scope = old_op->scope();
   auto& valid_places = old_op->valid_places();
   op->Attach(op_desc, scope);
...
...
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/fc_fuser.h"
...
...
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h"
+#include "fc_fuse_pass.h"
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include <vector>
 #include "paddle/fluid/lite/api/cxx_api.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"

 DEFINE_string(model_dir, "", "");
...
...
@@ -46,7 +46,7 @@ void FcFuser::BuildPattern() {
 void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   auto op_desc = GenOpDesc(matched);
   auto fc_op = LiteOpRegistry::Global().Create("fc");
-  auto mul = matched.at("mul")->stmt()->op;
+  auto mul = matched.at("mul")->stmt()->op();
   auto* scope = mul->scope();
   auto& valid_places = mul->valid_places();
   fc_op->Attach(op_desc, scope);
...
...
@@ -24,12 +24,12 @@ namespace lite {
 namespace mir {

 void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
-  LOG(INFO) << "final program \n" << Visualize(graph.get());
+  VLOG(4) << "final program \n" << Visualize(graph.get());
   for (auto& item : graph->StmtTopologicalOrder()) {
     if (item->IsStmt()) {
       auto& stmt = item->AsStmt();
       VLOG(4) << stmt;
-      insts_.emplace_back(stmt.op, std::move(stmt.valid_kernels.front()));
+      insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
     }
   }
 }
...
...
@@ -39,7 +39,7 @@ std::string Visualize(mir::SSAGraph* graph) {
     if (node.IsArg()) {
       key = node.AsArg().name;
     } else {
-      key = node.AsStmt().op_type + std::to_string(id++);
+      key = node.AsStmt().op_type() + std::to_string(id++);
     }
     if (node.IsStmt()) {
...
...
@@ -25,11 +25,11 @@ class IoCopyKernelPickPass : public StmtPass {
     for (auto& node : graph->mutable_nodes()) {
       if (!node.IsStmt()) continue;
       auto& inst = node.AsStmt();
-      if (inst.op_type != "io_copy") continue;
+      if (inst.op_type() != "io_copy") continue;
       LOG(INFO) << "....> picking a IO COPY kernel";
-      auto& kernels = node.AsStmt().valid_kernels;
+      auto& kernels = node.AsStmt().kernels();
       CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op";
       const auto* inty = node.inlinks.front()->AsArg().type;
       const auto* outy = node.outlinks.front()->AsArg().type;
...
...
@@ -13,3 +13,62 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/mir/node.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+const OpInfo *mir::Node::Stmt::op_info() const {
+  CHECK(op_);
+  return op_->op_info();
+}
+
+Place mir::Node::Stmt::place() const {
+  CHECK(!valid_kernels_.empty());
+  return valid_kernels_.front()->place();
+}
+
+KernelBase &mir::Node::Stmt::picked_kernel() {
+  CHECK(!valid_kernels_.empty()) << "no kernel for " << op_type();
+  return *valid_kernels_.front();
+}
+
+OpInfo *mir::Node::Stmt::mutable_op_info() {
+  CHECK(op_);
+  return op_->mutable_op_info();
+}
+
+void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc,
+                              const std::vector<Place> &valid_places,
+                              lite::Scope *scope) {
+  CHECK((op_ && op_->scope()) || scope) << "Either scope should be set";
+  lite::Scope *the_scope = scope ? scope : op_->scope();
+  op_->Attach(op_desc, the_scope);
+  // Recreate the kernels with the latest OpInfo.
+  valid_kernels_.clear();
+  if (!op_ || op_->op_info()->Type() != op_desc.Type()) {
+    op_ = LiteOpRegistry::Global().Create(op_desc.Type());
+    CHECK(op_) << "No op found for " << op_desc.Type();
+  }
+  valid_kernels_ = op_->CreateKernels(valid_places);
+}
+
+std::ostream &mir::operator<<(std::ostream &os, const mir::Node::Stmt &other) {
+  os << "Statement " << other.op_type() << " " << other.place();
+  return os;
+}
+
+mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) {
+  auto &x = AsArg();
+  x.name = name;
+  x.id = id;
+  return x;
+}
+
+mir::Node::Arg &mir::Node::AsArg(const std::string &name) {
+  auto &x = AsArg();
+  x.name = name;
+  return x;
+}
+
+}  // namespace lite
+}  // namespace paddle
...
@@ -41,32 +41,40 @@ class Node {
     kUnk,
   };

-  struct Stmt {
-    std::string op_type;
+  class Stmt {
     // The kernel instances this Statement contains.
-    std::vector<std::unique_ptr<KernelBase>> valid_kernels;
+    std::vector<std::unique_ptr<KernelBase>> valid_kernels_;
     // TODO(Superjomn) make this a shared_ptr for resource safety.
-    std::shared_ptr<OpLite> op;  // we hold op to run InferShape
-
-    const OpInfo* op_info() {
-      CHECK(op);
-      return op->op_info();
-    }
-    Place place() const {
-      CHECK(!valid_kernels.empty());
-      return valid_kernels.front()->place();
-    }
-    KernelBase& picked_kernel() {
-      CHECK(!valid_kernels.empty()) << "no kernel for " << op_type;
-      return *valid_kernels.front();
-    }
-    friend std::ostream& operator<<(std::ostream& os, const Stmt& other) {
-      os << "Statement " << other.op_type << " " << other.place();
-      return os;
-    }
+    std::shared_ptr<OpLite> op_;  // we hold op to run InferShape
+
+   public:
+    // Refresh the operator and kernels with the latest OpInfo.
+    void ResetOp(const cpp::OpDesc& op_desc,
+                 const std::vector<Place>& valid_places,
+                 lite::Scope* scope = nullptr);
+
+    std::string op_type() const { return op_info()->Type(); }
+    const OpInfo* op_info() const;
+    OpInfo* mutable_op_info();
+
+    void SetKernels(std::vector<std::unique_ptr<KernelBase>>&& kernels) {
+      valid_kernels_ = std::move(kernels);
+    }
+    std::vector<std::unique_ptr<KernelBase>>& kernels() {
+      return valid_kernels_;
+    }
+
+    void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; }
+    const std::shared_ptr<OpLite> op() const { return op_; }
+
+    Place place() const;
+    KernelBase& picked_kernel();
+
+    friend std::ostream& operator<<(std::ostream& os, const Stmt& other);
+
+    // Description.
+    std::string desc;
   };

   struct Arg {
@@ -78,26 +86,16 @@ class Node {
     bool is_weight{false};
   };

-  Arg& AsArg(const std::string& name, int id) {
-    auto& x = AsArg();
-    x.name = name;
-    x.id = id;
-    return x;
-  }
+  Arg& AsArg(const std::string& name, int id);

-  Arg& AsArg(const std::string& name) {
-    auto& x = AsArg();
-    x.name = name;
-    return x;
-  }
+  Arg& AsArg(const std::string& name);

   Stmt& AsStmt(const std::string& op_type,
                std::vector<std::unique_ptr<KernelBase>>&& kernels,
                const std::shared_ptr<OpLite>& op) {
     auto& x = AsStmt();
-    x.op_type = op_type;
-    x.op = op;
-    x.valid_kernels = std::move(kernels);
+    x.SetOp(op);
+    x.SetKernels(std::move(kernels));
     return x;
   }
@@ -142,7 +140,7 @@ class Node {
     }
     if (other.IsStmt()) {
       auto& arg = other.AsStmt();
-      os << "Statement " << arg.op_type;
+      os << "Statement " << arg.op_type();
     }
     return os;
   }
...
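For orientation, a minimal sketch of how a pass consumes the accessor-based
Stmt API after this change (assuming `graph` is an SSAGraph*, obtained as in
the passes elsewhere in this patch):

  for (auto& node : graph->mutable_nodes()) {
    if (!node.IsStmt()) continue;
    auto& stmt = node.AsStmt();
    // Former field reads become accessor calls:
    VLOG(4) << stmt.op_type() << " offers " << stmt.kernels().size()
            << " candidate kernels";
    CHECK(stmt.op()->scope());  // the op is reached through op(), not .op
  }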
...
@@ -139,14 +139,13 @@ struct PMNode {
   template <typename T>
   PMNode* assert_op_attr(const std::string& attr_name, const T& attr) {
-    asserts_.emplace_back([=](Node* x) {
+    asserts_.push_back([=](const Node* x) {
       if (x && x->IsStmt()) {
         auto* op_info = x->stmt()->op_info();
         return op_info->HasAttr(attr_name) &&
                op_info->GetAttr<T>(attr_name) == attr;
-      } else {
-        return false;
       }
+      return false;
     });
     return this;
   }
...
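A typical use inside a fuser's BuildPattern, sketched under the assumption of
the OpNode helpers declared on FuseBase below (the "scale" op and its float
attribute value are illustrative only):

  auto* scale = OpNode("scale", "scale")
                    ->assert_op_attr<float>("scale", 1.0f);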
...
@@ -41,6 +41,7 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) {
     }
   }

+  LOG(INFO) << "keys: " << key2nodes_.size();
   std::unordered_set<const Node *> nodes2rm;
   for (auto &matched : key2nodes_) {
     for (const auto &key : keys) {
...
...
@@ -49,7 +49,13 @@ class FuseBase {
   virtual void BuildPattern() = 0;

   // Generate an operator desc with a matched subgraph.
-  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) = 0;
+  virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) {
+    return cpp::OpDesc();
+  }
+
+  PMNode* OpNode(const std::string& key) {
+    return GetOrCreateNode(key)->assert_is_op();
+  }

   PMNode* OpNode(const std::string& key, const std::string& op_type);
...
...
@@ -52,7 +52,7 @@ class FcFuser : public FuseBase {
   void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
     auto op_desc = GenOpDesc(matched);
     auto fc_op = LiteOpRegistry::Global().Create("fc");
-    auto mul = matched.at("mul")->stmt()->op;
+    auto mul = matched.at("mul")->stmt()->op();
     auto* scope = mul->scope();
     auto& valid_places = mul->valid_places();
     fc_op->Attach(op_desc, scope);
@@ -90,7 +90,7 @@ std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
   main_block->Var("w");
   main_block->Var("out");

-  scope->Var("w")->GetMutable<lite::Tensor>();
+  scope->Var("x")->GetMutable<lite::Tensor>();
   scope->Var("b")->GetMutable<lite::Tensor>();
   scope->Var("mul_out")->GetMutable<lite::Tensor>();
   scope->Var("w")->GetMutable<lite::Tensor>();
...
...
@@ -23,19 +23,19 @@ namespace mir {
 void BuildGraph(SSAGraph* g) {
   g->mutable_nodes().emplace_back();
   Node& o1 = g->mutable_nodes().back();
-  o1.AsStmt().op_type = "op1";
+  o1.AsStmt().desc = "op1";
   g->mutable_nodes().emplace_back();
   Node& o2 = g->mutable_nodes().back();
-  o2.AsStmt().op_type = "op2";
+  o2.AsStmt().desc = "op2";
   g->mutable_nodes().emplace_back();
   Node& o3 = g->mutable_nodes().back();
-  o3.AsStmt().op_type = "op3";
+  o3.AsStmt().desc = "op3";
   g->mutable_nodes().emplace_back();
   Node& o4 = g->mutable_nodes().back();
-  o4.AsStmt().op_type = "op4";
+  o4.AsStmt().desc = "op4";
   g->mutable_nodes().emplace_back();
   Node& o5 = g->mutable_nodes().back();
-  o5.AsStmt().op_type = "op5";
+  o5.AsStmt().desc = "op5";
   g->mutable_nodes().emplace_back();
   Node& v1 = g->mutable_nodes().back();
   v1.AsArg("var1");
@@ -108,11 +108,11 @@ TEST(PatternMatcher, MarkPMNodesInGraph) {
   // v2 -> o3(a node named o3)
   auto* o2 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->IsStmt() && node->stmt()->op_type == "op2";
+    return node && node->IsStmt() && node->stmt()->desc == "op2";
   });
   auto* o3 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
-    return node && node->IsStmt() && node->stmt()->op_type == "op3";
+    return node && node->IsStmt() && node->stmt()->desc == "op3";
   });
   auto* v2 = x.pattern_.NewNode([](const Node* node) {
     // The teller can be any condition, such as op type, or variable's shape.
@@ -153,8 +153,8 @@ TEST(PatternMatcher, MultiSubgraph) {
   // op -> var
   auto* any_op = x.mutable_pattern()->NewNode(
       [](const Node* node) {
-        return node->IsStmt() && (node->stmt()->op_type == "op2" ||
-                                  node->stmt()->op_type == "op3");
+        return node->IsStmt() &&
+               (node->stmt()->desc == "op2" || node->stmt()->desc == "op3");
       },
       "OP0");
   auto* any_var =
@@ -170,9 +170,9 @@ TEST(PatternMatcher, MultiSubgraph) {
   int count = 0;
   PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s,
                                         SSAGraph* g) {
-    LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> "
+    LOG(INFO) << "Detect " << s.at(any_op)->stmt()->desc << " -> "
               << s.at(any_var)->arg()->name << " -> "
-              << s.at(any_op1)->stmt()->op_type;
+              << s.at(any_op1)->stmt()->desc;
     count++;
   };
@@ -197,12 +197,12 @@ TEST(PatternMatcher, IntermediateCheck) {
   PatternMatcher matcher;
   auto* op2 = matcher.mutable_pattern()->NewNode(
       [](const Node* x) {
-        return x && x->IsStmt() && x->stmt()->op_type == "op2";
+        return x && x->IsStmt() && x->stmt()->desc == "op2";
       },
       "op2");
   auto* op3 = matcher.mutable_pattern()->NewNode(
       [](const Node* x) {
-        return x && x->IsStmt() && x->stmt()->op_type == "op3";
+        return x && x->IsStmt() && x->stmt()->desc == "op3";
       },
       "op3");
   auto* v2 = matcher.mutable_pattern()
...
...
@@ -24,8 +24,10 @@ namespace lite {
 namespace mir {

 bool SSAGraph::CheckBidirectionalConnection() {
-  LOG(INFO) << "node count " << node_storage_.size();
+  VLOG(4) << "node count " << node_storage_.size();
   for (auto &node : node_storage_) {
+    if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type();
+    if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id;
     for (auto *in : node.inlinks) {
       CHECK(in->outlinks.end() !=
             std::find(in->outlinks.begin(), in->outlinks.end(), &node));
@@ -121,6 +123,7 @@ void SSAGraph::Build(const Program &program,
   std::unordered_map<std::string, mir::Node *> arg_update_node_map_;
   for (auto &op : program.ops()) {
+    VLOG(3) << op->op_info()->Type();
     auto *op_node = GraphCreateInstructNode(op, valid_places);
     for (const std::string &name : op->op_info()->input_names()) {
       mir::Node *arg_node = nullptr;
...
...
@@ -65,6 +65,10 @@ class SSAGraph : GraphBase {
   Node *GraphCreateInstructNode(const std::shared_ptr<OpLite> &op,
                                 const std::vector<Place> &valid_places);

+  // Device related attributes
+  const std::vector<Place> &valid_places() const { return valid_places_; }
+  void SetValidPlaces(const std::vector<Place> &x) { valid_places_ = x; }
+
  private:
   mir::Node *Argument(const std::string &name);
   // Check the bidirectional connection.
@@ -89,6 +93,7 @@ class SSAGraph : GraphBase {
  private:
   std::list<mir::Node> node_storage_;
   std::map<std::string, mir::Node *> arguments_;
+  std::vector<Place> valid_places_;
 };

 // Remove the link between a -> b.
...
...
@@ -17,7 +17,7 @@
 #include <memory>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"
...
...
@@ -37,9 +37,9 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     if (!node.IsStmt()) continue;
     auto& instruct = node.AsStmt();
     std::vector<std::pair<size_t, std::unique_ptr<KernelBase>>> scored;
-    CHECK(!instruct.valid_kernels.empty()) << "No kernels found for "
-                                           << instruct.op_type;
-    for (auto&& kernel : instruct.valid_kernels) {
+    CHECK(!instruct.kernels().empty()) << "No kernels found for "
+                                       << instruct.op_type();
+    for (auto&& kernel : instruct.kernels()) {
       size_t score = KernelGrade(*kernel);
       scored.emplace_back(score, std::move(kernel));
     }
@@ -49,9 +49,9 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
     // Move kernel back
     // Just keep a single best kernel.
     // TODO(Superjomn) reconsider this.
-    instruct.valid_kernels.clear();
-    instruct.valid_kernels.emplace_back(std::move(scored.front().second));
-    VLOG(2) << "pick " << instruct.valid_kernels.front()->name();
+    instruct.kernels().clear();
+    instruct.kernels().emplace_back(std::move(scored.front().second));
+    VLOG(2) << "pick " << instruct.kernels().front()->name();
   }
 }
...
...
@@ -62,7 +62,7 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, Node* inst_node,
   CHECK(in->AsArg().type);
   if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) {
     LOG(INFO) << "found Target unmatched tensor: " << in->AsArg().name
-              << " for kernel " << inst.op->DebugString() << " "
+              << " for kernel " << inst.op()->DebugString() << " "
               << *in->AsArg().type << " -> " << *decl_arg_type;
     // Add an IoCopy instruction to make the input compatible with other dist.
     AddIoCopyInst(*in->AsArg().type, *decl_arg_type, in, graph, inst_node,
@@ -89,7 +89,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
   CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed";
   // CHECK(io_copy_op);
   // Create the new var manually.
-  inst_node->AsStmt().op->scope()->Var(io_copy_output_name);
+  inst_node->AsStmt().op()->scope()->Var(io_copy_output_name);

   // Create IoCopy Instruction.
   cpp::OpDesc op_desc;
@@ -97,7 +97,7 @@ void TypeTargetTransformPass::AddIoCopyInst(
   op_desc.SetInput("Input", {in->AsArg().name});
   op_desc.SetOutput("Out", {io_copy_output_name});

-  io_copy_op->Attach(op_desc, inst_node->AsStmt().op->scope());
+  io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
   auto kernels = io_copy_op->CreateKernels(valid_places);
   io_copy_inst->AsStmt("io_copy", std::move(kernels), io_copy_op);
@@ -113,19 +113,19 @@ void TypeTargetTransformPass::AddIoCopyInst(
   DirectedLink(io_copy_output_arg, inst_node);

   // reset opdesc and update kernel information
-  UpdateInputTo(inst_node->AsStmt().op->mutable_op_info(), in->AsArg().name,
+  UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), in->AsArg().name,
                 io_copy_output_name);
-  inst_node->AsStmt().op->Attach(*inst_node->AsStmt().op->op_info(),
-                                 inst_node->AsStmt().op->scope());
+  inst_node->AsStmt().ResetOp(*inst_node->AsStmt().op_info(),
+                              graph->valid_places());

   std::string tmp;
   if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) {
     CHECK(false) << "get old a " << tmp;
   }

-  for (auto& kernel : inst_node->AsStmt().valid_kernels) {
-    inst_node->AsStmt().op->AttachKernel(kernel.get());
+  for (auto& kernel : inst_node->AsStmt().kernels()) {
+    inst_node->AsStmt().op()->AttachKernel(kernel.get());
   }

   graph->CheckValid();
...
...
@@ -15,12 +15,6 @@
 #pragma once
 #include "paddle/fluid/lite/core/mir/pass_registry.h"

-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
-
 #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 USE_MIR_PASS(demo);
 USE_MIR_PASS(static_kernel_pick_pass);
@@ -30,9 +24,11 @@ USE_MIR_PASS(generate_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
 #endif

 USE_MIR_PASS(runtime_context_assign_pass);
 USE_MIR_PASS(lite_conv_bn_fuse_pass);
 USE_MIR_PASS(graph_visualze);
 USE_MIR_PASS(lite_fc_fuse_pass);
+USE_MIR_PASS(identity_scale_eliminate_pass);
 USE_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass);
 USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
...
...
@@ -39,7 +39,7 @@ class VariablePlaceInferencePass : public DebugPass {
     for (const auto& v : graph->inputs()) {
       // the feed op might in the inputs
       if (v->IsStmt()) {
-        LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type;
+        LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type();
         continue;
       }
     }
@@ -59,10 +59,10 @@ class VariablePlaceInferencePass : public DebugPass {
     for (auto& x : graph->StmtTopologicalOrder()) {
       auto& inst = x->AsStmt();
       // The IoCopyOp is a tool operator, it won't support the type inference.
-      if (inst.op_type == "io_copy") continue;
+      if (inst.op_type() == "io_copy") continue;
       // LOG(INFO) << "- inferencing type " <<
       // deal with inputs
-      VLOG(4) << "inferencing op " << inst.op_type;
+      VLOG(4) << "Inferring op " << inst.op_info()->Repr();
       // TODO(zhaolong): Add check if the node's name in op's arguments.
       auto get_argname = [&](
@@ -90,12 +90,14 @@ class VariablePlaceInferencePass : public DebugPass {
         }
       }

+      VLOG(3) << "inst " << inst.op_info()->Repr();
       for (auto* x_out : x->outlinks) {
         std::string node_name = x_out->AsArg().name;
         std::string arg_name =
             get_argname(node_name, inst.op_info()->outputs());
         CHECK(arg_name.size() > 0) << "can not find op arguments for node "
-                                   << node_name;
+                                   << node_name << " in Inst "
+                                   << inst.op_type();
         VLOG(3) << "-- output arg_name " << arg_name;
         auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
         if (!x_out->AsArg().type) {
...
...
@@ -13,7 +13,7 @@
 // limitations under the License.

 #include <gtest/gtest.h>
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/optimizer.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"
 #include "paddle/fluid/lite/kernels/cuda/use_kernels.h"
...
...
@@ -61,7 +61,6 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
     targets.insert(place.target);
   }

-  // CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_;
   VLOG(2) << "op " << op_type_ << " get " << kernels.size() << " kernels";
   return kernels;
 }
@@ -83,7 +82,7 @@ bool OpLite::Attach(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   scope_ = scope;
   op_info_.reset(
       new OpInfo(opdesc));  // Force clean the out-of-date information.
-  return AttachImpl(opdesc, scope);
+  return AttachImpl(*op_info(), scope);
 }

 const Tensor *OpLite::GetTensor(lite::Scope *scope,
...
...
@@ -54,9 +54,7 @@ class OpLite : public Registry {
   OpLite() = default;
   explicit OpLite(const std::string &type) : op_type_(type) {}
   explicit OpLite(const std::vector<Place> &valid_places)
-      : valid_places_(valid_places) {
-    LOG(INFO) << "valid places " << valid_places.size();
-  }
+      : valid_places_(valid_places) {}

   void SetValidPlaces(const std::vector<Place> &places) {
     VLOG(3) << "valid places " << valid_places_.size();
@@ -199,6 +197,22 @@ class OpInfo : public cpp::OpDesc {
     }
     return false;
   }
+
+  void UpdateAllInputs(const std::string &from, const std::string &to) {
+    for (auto &item : inputs_) {
+      for (auto &var : item.second) {
+        if (var == from) var = to;
+      }
+    }
+  }
+
+  void UpdateAllOutputs(const std::string &from, const std::string &to) {
+    for (auto &item : outputs_) {
+      for (auto &var : item.second) {
+        if (var == from) var = to;
+      }
+    }
+  }
 };

 }  // namespace lite
...
...
@@ -43,6 +43,8 @@ class Optimizer {
     CHECK(!graph_) << "duplicate optimize found";
     graph_.reset(new mir::SSAGraph);
     graph_->Build(program, valid_places);
+    graph_->SetValidPlaces(valid_places);
+
     SpecifyKernelPickTactic(kernel_pick_factor);
     InitTargetTypeTransformPass();
@@ -51,6 +53,7 @@ class Optimizer {
           "lite_conv_bn_fuse_pass",                          //
           "lite_conv_elementwise_add_activation_fuse_pass",  //
           "lite_fc_fuse_pass",                               //
+          "identity_scale_eliminate_pass",                   //
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
           "lite_elementwise_add_activation_fuse_pass",  //
 #endif
...
...
@@ -18,8 +18,8 @@
 #include <utility>
 #include "paddle/fluid/lite/core/mir/generate_program_pass.h"
 #include "paddle/fluid/lite/core/mir/pass_manager.h"
-#include "paddle/fluid/lite/core/mir/passes.h"
 #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"
+#include "paddle/fluid/lite/core/mir/use_passes.h"
 #include "paddle/fluid/lite/core/program_fake_utils.h"

 namespace paddle {
...
...
@@ -19,7 +19,7 @@ namespace lite {
 namespace profile {

 const int BasicTimer::data_w = 10;
-const int BasicTimer::name_w = 10;
+const int BasicTimer::name_w = 15;

 }  // namespace profile
 }  // namespace lite
...
...
@@ -140,7 +140,7 @@ class RuntimeProgram {
   void Run() {
     for (auto& inst : instructions_) {
-      VLOG(4) << ">> Running kernel: " << inst;
+      VLOG(3) << ">> Running kernel: " << inst.op()->op_info()->Repr();
       inst.Run();
     }
   }
...
...
@@ -91,6 +91,18 @@ class DDimBase {
     return os;
   }

+  friend bool operator==(const DDimBase &a, const DDimBase &b) {
+    if (a.size() != b.size()) return false;
+    for (size_t i = 0; i < a.size(); i++) {
+      if (a[i] != b[i]) return false;
+    }
+    return true;
+  }
+
+  friend bool operator!=(const DDimBase &a, const DDimBase &b) {
+    return !(a == b);
+  }
+
  private:
   DDimT *self() { return static_cast<DDimT *>(this); }
   const DDimT *const_self() const { return static_cast<const DDimT *>(this); }
@@ -154,6 +166,7 @@ class TensorBase {
   const void *raw_data() const { return const_self()->data(); }
   size_t data_size() const { return const_self()->dims().production(); }
+  size_t memory_size() const { return const_self()->memory_size(); }
   void ShareDataWith(const TensorBase &other) { self()->ShareDataWith(other); }
   void CopyDataFrom(const TensorBase &other) { self()->CopyDataFrom(other); }
@@ -175,5 +188,12 @@ class TensorBase {
   }
 };

+template <typename TensorT>
+bool TensorCompareWith(const TensorT &a, const TensorT &b) {
+  if (a.dims() != b.dims()) return false;
+  if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false;
+  return true;
+}
+
 }  // namespace lite
 }  // namespace paddle
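Together these additions let test code assert tensor equality in one call; a
small sketch, assuming two lite::Tensor values a and b holding the same data:

  CHECK(a.dims() == b.dims());     // the new DDimBase::operator==
  CHECK(TensorCompareWith(a, b));  // dims match and the first data_size()
                                   // bytes of raw storage compare equal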
...
@@ -11,7 +11,7 @@ cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -24,7 +24,7 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
-lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
+lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
@@ -40,7 +40,7 @@ set(arm_kernels
        softmax_compute_arm
        conv_compute_arm
        batch_norm_compute_arm
-        elementwise_add_compute_arm
+        elementwise_compute_arm
        pool_compute_arm
        split_compute_arm
        concat_compute_arm
...
...
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h"
+#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
+#include <string>
 #include "paddle/fluid/lite/arm/math/funcs.h"

 namespace paddle {
@@ -20,6 +21,30 @@ namespace lite {
 namespace kernels {
 namespace arm {

+inline bool is_broadcast(const DDim& x_dims, const DDim& y_dims, int axis,
+                         int* pre, int* n, int* post) {
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  if (x_dims.size() == y_dims.size()) {
+    return false;
+  }
+  *pre = 1;
+  *n = 1;
+  *post = 1;
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch.";
+    (*n) *= y_dims[i];
+  }
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    (*post) *= x_dims[i];
+  }
+  return true;
+}
+
 void ElementwiseAddCompute::Run() {
   auto& param = Param<operators::ElementwiseParam>();
   const float* x_data = param.X->data<float>();
@@ -28,27 +53,40 @@ void ElementwiseAddCompute::Run() {
   int axis = param.axis;
   auto x_dims = param.X->dims();
   auto y_dims = param.Y->dims();
-  if (axis < 0) {
-    axis = x_dims.size() - y_dims.size();
-  }
-  if (x_dims.size() == y_dims.size()) {
+  int pre, n, post;
+  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    lite::arm::math::elementwise_add_broadcast(x_data, y_data, out_data, pre,
+                                               n, post);
+  } else {
     lite::arm::math::elementwise_add(x_data, y_data, out_data,
                                      x_dims.production());
-  } else {
-    int batch = 1;
-    int channels = 1;
-    int num = 1;
-    for (int i = 0; i < axis; ++i) {
-      batch *= x_dims[i];
-    }
-    for (int i = 0; i < y_dims.size(); ++i) {
-      channels *= y_dims[i];
-    }
-    for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
-      num *= x_dims[i];
-    }
-    lite::arm::math::elementwise_add_axis(x_data, y_data, out_data, batch,
-                                          channels, num);
   }
 }
+
+void ElementwiseAddActivationCompute::Run() {
+  auto& param = Param<operators::FusionElementwiseActivationParam>();
+  const float* x_data = param.X->data<float>();
+  const float* y_data = param.Y->data<float>();
+  float* out_data = param.Out->mutable_data<float>();
+  int axis = param.axis;
+  std::string act_type = param.act_type;
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int pre, n, post;
+  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_add_relu_broadcast(x_data, y_data, out_data,
+                                                      pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_add_relu(x_data, y_data, out_data,
+                                            x_dims.production());
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  }
+}
@@ -63,3 +101,11 @@ REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW,
     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(
+    fusion_elementwise_add_activation, kARM, kFloat, kNCHW,
+    paddle::lite::kernels::arm::ElementwiseAddActivationCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
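A worked example of the decomposition is_broadcast computes, assuming DDim
values x_dims = {2, 3, 4, 5} and y_dims = {3, 4} with axis = 1:

  int pre, n, post;
  is_broadcast(x_dims, y_dims, 1, &pre, &n, &post);
  // -> pre == 2, n == 12 (3 * 4), post == 5: each element of y is reused
  //    across pre * post = 10 stripes of x. Inputs of equal rank return
  //    false and take the flat elementwise_add path instead.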
...
@@ -30,6 +30,14 @@ class ElementwiseAddCompute
   virtual ~ElementwiseAddCompute() = default;
 };

+class ElementwiseAddActivationCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ElementwiseAddActivationCompute() = default;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
...
...@@ -12,8 +12,9 @@ ...@@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" #include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string>
#include <vector> #include <vector>
#include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/op_registry.h"
...@@ -37,7 +38,9 @@ TEST(elementwise_add_arm, init) { ...@@ -37,7 +38,9 @@ TEST(elementwise_add_arm, init) {
} }
template <typename dtype> template <typename dtype>
void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { void elementwise_compute_ref(const operators::ElementwiseParam& param,
const std::string elt_type,
const std::string act_type) {
const dtype* x_data = param.X->data<const dtype>(); const dtype* x_data = param.X->data<const dtype>();
const dtype* y_data = param.Y->data<const dtype>(); const dtype* y_data = param.Y->data<const dtype>();
dtype* out_data = param.Out->mutable_data<dtype>(); dtype* out_data = param.Out->mutable_data<dtype>();
...@@ -59,6 +62,8 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { ...@@ -59,6 +62,8 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) {
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i]; num *= x_dims[i];
} }
// do elementwise add/sub/max...
if (elt_type == "add") {
for (int i = 0; i < batch; ++i) { for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) { for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num; int offset = (i * channels + j) * num;
...@@ -72,6 +77,39 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { ...@@ -72,6 +77,39 @@ void elementwise_add_compute_ref(const operators::ElementwiseParam& param) {
} }
} }
} }
} else if (elt_type == "sub") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr - diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type;
}
// do activation relu/sigmod...
if (act_type.size() > 0) {
if (act_type == "relu") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
dtype* dout_ptr = out_data + (i * channels + j) * num;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f;
dout_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Activation type: " << elt_type;
}
}
} }
TEST(elementwise_add, compute) { TEST(elementwise_add, compute) {
...@@ -79,6 +117,19 @@ TEST(elementwise_add, compute) { ...@@ -79,6 +117,19 @@ TEST(elementwise_add, compute) {
operators::ElementwiseParam param; operators::ElementwiseParam param;
lite::Tensor x, y, output, output_ref; lite::Tensor x, y, output, output_ref;
#if 1
for (auto n : {1, 3, 4}) {
for (auto c : {1, 3, 4}) {
for (auto h : {1, 3, 4}) {
for (auto w : {1, 3, 4}) {
for (auto axis : {-1, 0, 1, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({c, h}),
std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) {
#else
for (auto n : {1, 3, 4, 11}) { for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 4, 11}) { for (auto c : {1, 3, 4, 11}) {
for (auto h : {1, 3, 4, 11}) { for (auto h : {1, 3, 4, 11}) {
...@@ -91,6 +142,7 @@ TEST(elementwise_add, compute) { ...@@ -91,6 +142,7 @@ TEST(elementwise_add, compute) {
std::vector<int64_t>({h, w}), std::vector<int64_t>({n, c, h}), std::vector<int64_t>({h, w}), std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({c, h, w}), std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) { std::vector<int64_t>({n, c, h, w})}) {
#endif
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w})); auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd); auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
...@@ -123,7 +175,102 @@ TEST(elementwise_add, compute) { ...@@ -123,7 +175,102 @@ TEST(elementwise_add, compute) {
elementwise_add.SetParam(param); elementwise_add.SetParam(param);
elementwise_add.Run(); elementwise_add.Run();
param.Out = &output_ref; param.Out = &output_ref;
elementwise_add_compute_ref<float>(param); elementwise_compute_ref<float>(param, "add", "");
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
TEST(fusion_elementwise_add_activation_arm, retrive_op) {
auto fusion_elementwise_add_activation =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"fusion_elementwise_add_activation");
ASSERT_FALSE(fusion_elementwise_add_activation.empty());
ASSERT_TRUE(fusion_elementwise_add_activation.front());
}
TEST(fusion_elementwise_add_activation_arm, init) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFloat));
ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kARM));
}
TEST(fusion_elementwise_add_activation_arm, compute) {
ElementwiseAddActivationCompute fusion_elementwise_add_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
#if 1
for (auto act_type : {"relu"}) {
for (auto n : {1, 3, 4}) {
for (auto c : {1, 3, 4}) {
for (auto h : {1, 3, 4}) {
for (auto w : {1, 3, 4}) {
for (auto axis : {-1, 0, 1, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({h, w}),
std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({n, c, h, w})}) {
#else
for (auto act_type : {"relu"}) {
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 4, 11}) {
for (auto h : {1, 3, 4, 11}) {
for (auto w : {1, 3, 4, 11}) {
for (auto axis : {-1, 0, 1, 2, 3}) {
for (auto yd :
{std::vector<int64_t>({n}), std::vector<int64_t>({c}),
std::vector<int64_t>({h}), std::vector<int64_t>({w}),
std::vector<int64_t>({n, c}), std::vector<int64_t>({c, h}),
std::vector<int64_t>({h, w}),
std::vector<int64_t>({n, c, h}),
std::vector<int64_t>({c, h, w}),
std::vector<int64_t>({n, c, h, w})}) {
#endif
auto x_dim = DDim(std::vector<int64_t>({n, c, h, w}));
auto y_dim = DDim(yd);
int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis;
if (axis_t + y_dim.size() > 4) continue;
bool flag = false;
for (int i = 0; i < y_dim.size(); i++) {
if (x_dim[i + axis_t] != y_dim[i]) flag = true;
}
if (flag) continue;
x.Resize(x_dim);
y.Resize(y_dim);
output.Resize(x_dim);
output_ref.Resize(x_dim);
auto* x_data = x.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
auto* output_data = output.mutable_data<float>();
auto* output_ref_data = output_ref.mutable_data<float>();
for (int i = 0; i < x_dim.production(); i++) {
float sign = i % 3 == 0 ? -1.0f : 1.0f;
x_data[i] = i * sign;
}
for (int i = 0; i < y_dim.production(); i++) {
float sign = i % 2 == 0 ? 0.5f : -0.5f;
y_data[i] = i * sign;
}
param.X = &x;
param.Y = &y;
param.axis = axis;
param.Out = &output;
param.act_type = act_type;
fusion_elementwise_add_activation.SetParam(param);
fusion_elementwise_add_activation.Run();
param.Out = &output_ref;
elementwise_compute_ref<float>(param, "add", act_type);
for (int i = 0; i < output.dims().production(); i++) {
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
...@@ -133,6 +280,7 @@ TEST(elementwise_add, compute) {
}
}
}
}
}
} // namespace arm
...@@ -141,3 +289,4 @@ TEST(elementwise_add, compute) {
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
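Numerically, the fused kernel registered above is just elementwise add followed by the activation, which is what elementwise_compute_ref checks with act_type = "relu". A scalar sketch of that reference path, assuming equal shapes and no broadcasting (illustrative only):

#include <algorithm>
#include <vector>

// out[i] = relu(x[i] + y[i]) -- the act_type == "relu" case tested above.
std::vector<float> FusedAddRelu(const std::vector<float>& x,
                                const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::max(x[i] + y[i], 0.0f);
  }
  return out;
}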
...@@ -80,12 +80,19 @@ TEST(softmax_arm, compute) {
lite::Tensor x;
lite::Tensor output;
lite::Tensor output_ref;
#if 1
for (auto n : {1, 3}) {
for (auto c : {1, 4}) {
for (auto h : {5, 1}) {
for (auto w : {1, 6}) {
for (auto axis : {-2, -1, 0, 1, 2}) {
#else
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4}) {
for (auto h : {3, 1, 11, 4}) {
for (auto w : {1, 3, 4, 12}) {
for (auto axis : {-4, -3, -2, -1, 0, 1, 2, 3}) {
#endif
x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
......
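The softmax sweep above includes negative axes, which count from the last dimension (axis = -1 means w here), and the per-slice reference is exp(x - max) normalized by its sum. A standalone sketch for one 1-D slice, assuming the stable max-subtraction formulation (name illustrative):

#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax over one axis slice: subtracting the max
// before exp() keeps the exponentials from overflowing.
std::vector<float> SoftmaxSlice(const std::vector<float>& x) {
  const float max_v = *std::max_element(x.begin(), x.end());
  std::vector<float> out(x.size());
  float sum = 0.0f;
  for (size_t i = 0; i < x.size(); ++i) {
    out[i] = std::exp(x[i] - max_v);
    sum += out[i];
  }
  for (float& v : out) v /= sum;
  return out;
}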
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*
* ATTENTION: this header file can only be included in .cc files.
*/
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_X86
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
#endif
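This header exists only for its side effects: each USE_LITE_KERNEL line forces the named kernel's registration symbol into the final link, so the kernels remain available even when nothing references them directly. A sketch of the intended usage pattern (file name and include path hypothetical):

// main.cc -- include once in a .cc, never in another header, so each
// registration is pulled into the link exactly once.
#include "paddle/fluid/lite/api/use_kernels.h"  // hypothetical location

int main() {
  // Build and run a predictor here; the kernels listed in the header are
  // now registered regardless of linker dead-stripping.
  return 0;
}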
...@@ -18,6 +18,18 @@ cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x86)
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
set(x86_kernels
activation_compute_x86
elementwise_compute_x86
......
...@@ -12,88 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/concat_compute.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConcatParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
int64_t axis = static_cast<int64_t>(param.axis);
auto out = param.output;
if (axis == 0 && param.x.size() < 10) {
size_t output_offset = 0;
for (auto* in : param.x) {
if (!in || in->dims().production() == 0UL) {
continue;
}
auto in_stride = framework::stride_numel(in->dims().data());
auto out_stride = framework::stride_numel(out->dims().data());
paddle::operators::StridedNumelCopyWithAxis<T>(
platform::CPUDeviceContext(), axis,
out->mutable_data<T>() + output_offset, out_stride, in->data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
std::vector<lite::Tensor> inputs;
for (size_t j = 0; j < param.x.size(); ++j) {
if (param.x[j] && param.x[j]->dims().production() > 0) {
inputs.push_back(*param.x[j]);
} else {
continue;
}
}
int num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i].dims().production() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
// computation
auto output_data = param.output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = inputs[j].data<float>();
for (int k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
virtual ~ConcatCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(concat, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::ConcatCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <vector>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class ConcatCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConcatParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
int64_t axis = static_cast<int64_t>(param.axis);
auto out = param.output;
if (axis == 0 && param.x.size() < 10) {
size_t output_offset = 0;
for (auto* in : param.x) {
if (!in || in->dims().production() == 0UL) {
continue;
}
auto in_stride = framework::stride_numel(in->dims().data());
auto out_stride = framework::stride_numel(out->dims().data());
paddle::operators::StridedNumelCopyWithAxis<T>(
platform::CPUDeviceContext(), axis,
out->mutable_data<T>() + output_offset, out_stride, in->data<T>(),
in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
std::vector<lite::Tensor> inputs;
for (size_t j = 0; j < param.x.size(); ++j) {
if (param.x[j] && param.x[j]->dims().production() > 0) {
inputs.push_back(*param.x[j]);
} else {
continue;
}
}
int num = inputs.size();
int rows = 1;
auto dim_0 = inputs[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(inputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = inputs[i].dims().production() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
// computation
auto output_data = param.output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
auto input_data = inputs[j].data<float>();
for (int k = 0; k < out_rows; ++k) {
std::memcpy(output_data + k * out_cols + col_idx,
input_data + k * col_len, sizeof(T) * col_len);
}
col_idx += col_len;
}
}
}
virtual ~ConcatCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/concat_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(concat_x86, retrive_op) {
auto concat =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"concat");
ASSERT_FALSE(concat.empty());
ASSERT_TRUE(concat.front());
}
TEST(concat_x86, init) {
ConcatCompute<float> concat;
ASSERT_EQ(concat.precision(), PRECISION(kFloat));
ASSERT_EQ(concat.target(), TARGET(kX86));
}
TEST(concat_x86, run_test) {
lite::Tensor x1, x2, out;
constexpr int batch_size = 1;
std::vector<int64_t> x1_shape{batch_size, 1, 3, 3};
x1.Resize(lite::DDim(x1_shape));
std::vector<int64_t> x2_shape{batch_size, 1, 3, 3};
x2.Resize(lite::DDim(x2_shape));
std::vector<lite::Tensor*> x = {&x1, &x2};
std::vector<int64_t> out_shape{batch_size, 2, 3, 3};
out.Resize(lite::DDim(out_shape));
auto x1_data = x1.mutable_data<float>();
auto x2_data = x2.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x1.dims().production(); i++) {
x1_data[i] = 1;
x2_data[i] = 2;
}
ConcatCompute<float> concat;
operators::ConcatParam param;
param.x = x;
param.output = &out;
param.axis = 1;
concat.SetParam(param);
concat.Run();
std::cout << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
std::cout << out_data[i] << " ";
}
std::cout << std::endl;
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def);
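In the general branch of ConcatCompute every input is flattened to (rows, cols): rows is the product of the dimensions before the axis and cols the product from the axis onward, so the run above (two 1x1x3x3 inputs, axis = 1) concatenates a 1x9 and a 1x9 block into 1x18. A standalone check of that arithmetic (names illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

// Product of dims[from..to), as ConcatCompute's row/col flattening does.
int64_t Product(const std::vector<int64_t>& d, size_t from, size_t to) {
  int64_t p = 1;
  for (size_t i = from; i < to; ++i) p *= d[i];
  return p;
}

int main() {
  const std::vector<int64_t> x1{1, 1, 3, 3}, x2{1, 1, 3, 3};
  const size_t axis = 1;
  const int64_t rows = Product(x1, 0, axis);           // 1
  const int64_t cols = Product(x1, axis, x1.size()) +
                       Product(x2, axis, x2.size());   // 9 + 9
  assert(rows == 1 && cols == 18);
  return 0;
}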
...@@ -12,144 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/conv_compute.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/lite/operators/conv_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename T>
class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter;
param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
std::vector<int64_t> output_shape_vec(param.output->dims().Vectorize());
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = param.x->dims()[1] / param.groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
lite::DDim col_shape(col_shape_vec);
lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings,
param.dilations);
lite::Tensor col;
lite::Tensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>();
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size());
lite::DDim filter_matrix_shape(std::vector<int64_t>{
filter.dims()[0], filter.dims().production() / filter.dims()[0]});
filter.Resize(filter_matrix_shape);
lite::DDim output_matrix_shape(std::vector<int64_t>{
param.output->dims()[1],
param.output->dims().production() /
(param.output->dims()[0] * param.output->dims()[1])});
int in_step = static_cast<int>(param.x->dims()[1]) / param.groups;
int out_step = static_cast<int>(param.output->dims()[1]) / param.groups;
paddle::operators::math::Vol2ColFunctor<platform::CPUDeviceContext, T>
vol2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T>
im2col;
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
platform::CPUDeviceContext());
for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch;
in_batch.ShareDataWith(
param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data()));
lite::Tensor out_batch;
out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize(
output_matrix_shape.data()));
for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice;
in_slice.ShareDataWith(
in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step));
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides,
std::vector<int>{param.paddings[0], param.paddings[1],
param.paddings[0], param.paddings[1]},
&(col.raw_tensor()));
} else if (data_dim == 3U) {
// vol2col
vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides, param.paddings,
&(col.raw_tensor()));
}
// gemm
lite::Tensor out_slice;
out_slice.ShareDataWith(
out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
lite::Tensor filter_slice;
filter_slice.ShareDataWith(
filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(),
false, T(1.0), &(out_slice.raw_tensor()), T(0.0));
}
}
}
virtual ~Conv2dCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::Conv2dCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/lite/operators/conv_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
inline bool IsExpand(const std::vector<int64_t>& filter_dim,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations) {
bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
for (size_t j = 0; j < strides.size(); ++j) {
filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
strides_1 = strides_1 && (strides[j] == 1);
padding_0 = padding_0 && (paddings[j] == 0);
dilation_1 = dilation_1 && (dilations[j] == 1);
}
return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
template <typename T>
class Conv2dCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ConvParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ConvParam>();
lite::Tensor filter = *param.filter;
param.output->template mutable_data<T>();
const int batch_size = static_cast<int>(param.x->dims()[0]);
std::vector<int64_t> filter_shape_vec(filter.dims().Vectorize());
std::vector<int64_t> output_shape_vec(param.output->dims().Vectorize());
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = param.x->dims()[1] / param.groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
lite::DDim col_shape(col_shape_vec);
lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1);
bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings,
param.dilations);
lite::Tensor col;
lite::Tensor col_matrix;
if (is_expand) {
col.Resize(col_shape);
col.mutable_data<T>();
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size());
lite::DDim filter_matrix_shape(std::vector<int64_t>{
filter.dims()[0], filter.dims().production() / filter.dims()[0]});
filter.Resize(filter_matrix_shape);
lite::DDim output_matrix_shape(std::vector<int64_t>{
param.output->dims()[1],
param.output->dims().production() /
(param.output->dims()[0] * param.output->dims()[1])});
int in_step = static_cast<int>(param.x->dims()[1]) / param.groups;
int out_step = static_cast<int>(param.output->dims()[1]) / param.groups;
paddle::operators::math::Vol2ColFunctor<platform::CPUDeviceContext, T>
vol2col;
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T>
im2col;
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
platform::CPUDeviceContext());
for (int i = 0; i < batch_size; i++) {
lite::Tensor in_batch;
in_batch.ShareDataWith(
param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data()));
lite::Tensor out_batch;
out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize(
output_matrix_shape.data()));
for (int g = 0; g < param.groups; g++) {
lite::Tensor in_slice;
in_slice.ShareDataWith(
in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step));
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides,
std::vector<int>{param.paddings[0], param.paddings[1],
param.paddings[0], param.paddings[1]},
&(col.raw_tensor()));
} else if (data_dim == 3U) {
// vol2col
vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(),
param.dilations, param.strides, param.paddings,
&(col.raw_tensor()));
}
// gemm
lite::Tensor out_slice;
out_slice.ShareDataWith(
out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
lite::Tensor filter_slice;
filter_slice.ShareDataWith(
filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step));
blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(),
false, T(1.0), &(out_slice.raw_tensor()), T(0.0));
}
}
}
virtual ~Conv2dCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/conv_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(conv_x86, retrive_op) {
auto conv2d =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"conv2d");
ASSERT_FALSE(conv2d.empty());
ASSERT_TRUE(conv2d.front());
}
TEST(conv2d_x86, init) {
Conv2dCompute<float> conv2d;
ASSERT_EQ(conv2d.precision(), PRECISION(kFloat));
ASSERT_EQ(conv2d.target(), TARGET(kX86));
}
TEST(conv2d_x86, run_test) {
lite::Tensor x, filter, b, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 3, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> filter_shape{1, 3, 3, 3};
filter.Resize(lite::DDim(filter_shape));
std::vector<int64_t> b_shape{1, 3, 1, 1};
b.Resize(lite::DDim(b_shape));
std::vector<int64_t> out_shape{batch_size, 1, 1, 1};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto filter_data = filter.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = 1;
}
for (int64_t i = 0; i < filter.dims().production(); i++) {
filter_data[i] = 1;
}
for (int64_t i = 0; i < b.dims().production(); i++) {
b_data[i] = 0;
}
Conv2dCompute<float> conv2d;
operators::ConvParam param;
param.x = &x;
param.filter = &filter;
param.bias = &b;
param.output = &out;
param.strides = {1, 1};
param.paddings = {0, 0};
param.groups = 1;
param.dilations = {1, 1};
conv2d.SetParam(param);
conv2d.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i] << " ";
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def);
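The col buffer that Conv2dCompute builds has shape (C_in/groups, k_h, k_w, out_h, out_w), flattened to a (C_in/groups * k_h * k_w) x (out_h * out_w) matrix for the GEMM; for the test above (3x3x3 input, one 3x3x3 filter, stride 1, no padding) that is a 27x1 column matrix multiplied by a 1x27 filter row. A standalone sketch of the shape arithmetic (illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t c_in = 3, k = 3, in_hw = 3, stride = 1, pad = 0, groups = 1;
  const int64_t out_hw = (in_hw + 2 * pad - k) / stride + 1;  // 1
  const int64_t col_rows = (c_in / groups) * k * k;           // 27
  const int64_t col_cols = out_hw * out_hw;                   // 1
  assert(out_hw == 1 && col_rows == 27 && col_cols == 1);
  // GEMM: (1 x 27 filter slice) * (27 x 1 col matrix) -> one output value.
  return 0;
}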
...@@ -12,72 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <random>
+#include "paddle/fluid/lite/kernels/x86/dropout_compute.h"
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>();
auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) {
auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd;
std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(param.mask->dims().data());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < param.dropout_prob) {
mask_data[i] = 0;
out_data[i] = 0;
} else {
if (param.dropout_implementation == "upscale_in_train") {
mask_data[i] = 1.0f / static_cast<T>(1.0f - param.dropout_prob);
out_data[i] = x_data[i] / static_cast<T>(1.0f - param.dropout_prob);
} else {
mask_data[i] = 1;
out_data[i] = x_data[i];
}
}
}
} else {
auto X = EigenMatrix<T>::Reshape(param.x->raw_tensor(), 1);
auto Y = EigenMatrix<T>::Reshape(param.output->raw_tensor(), 1);
auto& place = *platform::CPUDeviceContext().eigen_device();
if (param.dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - param.dropout_prob);
}
}
}
virtual ~DropoutCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(dropout, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::DropoutCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class DropoutCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::DropoutParam;
void Run() override {
auto& param = *param_.get_mutable<operators::DropoutParam>();
const auto* x_data = param.x->data<T>();
auto* out_data = param.output->template mutable_data<T>();
if (!param.is_test) {
auto* mask_data = param.mask->template mutable_data<T>();
std::random_device rnd;
std::minstd_rand engine;
int seed = param.fix_seed ? param.seed : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(param.mask->dims().data());
for (size_t i = 0; i < size; ++i) {
if (dist(engine) < param.dropout_prob) {
mask_data[i] = 0;
out_data[i] = 0;
} else {
if (param.dropout_implementation == "upscale_in_train") {
mask_data[i] = 1.0f / static_cast<T>(1.0f - param.dropout_prob);
out_data[i] = x_data[i] / static_cast<T>(1.0f - param.dropout_prob);
} else {
mask_data[i] = 1;
out_data[i] = x_data[i];
}
}
}
} else {
auto X = EigenMatrix<T>::Reshape(param.x->raw_tensor(), 1);
auto Y = EigenMatrix<T>::Reshape(param.output->raw_tensor(), 1);
auto& place = *platform::CPUDeviceContext().eigen_device();
if (param.dropout_implementation == "upscale_in_train") {
Y.device(place) = X;
} else {
Y.device(place) = X * static_cast<T>(1.0f - param.dropout_prob);
}
}
}
virtual ~DropoutCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/dropout_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(dropout_x86, retrive_op) {
auto dropout =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"dropout");
ASSERT_FALSE(dropout.empty());
ASSERT_TRUE(dropout.front());
}
TEST(dropout_x86, init) {
DropoutCompute<float> dropout;
ASSERT_EQ(dropout.precision(), PRECISION(kFloat));
ASSERT_EQ(dropout.target(), TARGET(kX86));
}
TEST(dropout_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
// DropoutCompute dropout;
DropoutCompute<float> dropout;
operators::DropoutParam param;
param.x = &x;
param.dropout_prob = 0.25;
param.is_test = true;
param.fix_seed = true;
param.output = &out;
dropout.SetParam(param);
dropout.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
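The test above runs with is_test = true and leaves dropout_implementation at its default, so the kernel takes the Eigen branch and scales every element by (1 - dropout_prob); only "upscale_in_train" copies the input through unchanged at inference. A scalar sketch of the expected values, assuming the param's default is the downgrade behavior:

#include <cassert>
#include <cmath>

int main() {
  const float dropout_prob = 0.25f;
  const float x = 8.0f;
  // Inference-time "downgrade_in_infer" scaling: y = x * (1 - p).
  const float y = x * (1.0f - dropout_prob);
  assert(std::fabs(y - 6.0f) < 1e-6f);
  return 0;
}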
...@@ -12,113 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
class ElementwiseSubCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, SubFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseSubCompute() = default;
};
template <typename T>
struct SubGradDX {
T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseGradParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.X_grad->template mutable_data<T>();
param.Y_grad->template mutable_data<T>();
// skip out, x, y
auto dout = param.Out_grad->raw_tensor();
auto dx = param.X_grad->raw_tensor();
auto dy = param.Y_grad->raw_tensor();
auto& skip = dout;
paddle::operators::ElemwiseExplicitGradCompute<
platform::CPUDeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
*context.x86_execution_context(), skip, skip, skip, dout, param.axis,
&dx, &dy, SubGradDX<T>(), SubGradDY<T>());
}
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T>
class ElementwiseAddCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, AddFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseAddCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// float
REGISTER_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::ElementwiseSubCompute<float>,
def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
struct SubFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
};
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
template <typename T>
class ElementwiseSubCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<SubFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, SubFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseSubCompute() = default;
};
template <typename T>
struct SubGradDX {
T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct SubGradDY {
T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename T>
class ElementwiseSubGradCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseGradParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.X_grad->template mutable_data<T>();
param.Y_grad->template mutable_data<T>();
// skip out, x, y
auto dout = param.Out_grad->raw_tensor();
auto dx = param.X_grad->raw_tensor();
auto dy = param.Y_grad->raw_tensor();
auto& skip = dout;
paddle::operators::ElemwiseExplicitGradCompute<
platform::CPUDeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
*context.x86_execution_context(), skip, skip, skip, dout, param.axis,
&dx, &dy, SubGradDX<T>(), SubGradDY<T>());
}
virtual ~ElementwiseSubGradCompute() = default;
};
template <typename T>
class ElementwiseAddCompute
: public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ElementwiseParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
auto& context = ctx_->As<X86Context>();
CHECK(context.x86_device_context());
param.Out->template mutable_data<T>();
paddle::operators::ElementwiseComputeEx<AddFunctor<T>,
platform::CPUDeviceContext, T>(
*context.x86_execution_context(), &param.X->raw_tensor(),
&param.Y->raw_tensor(), param.axis, AddFunctor<T>(),
&param.Out->raw_tensor());
}
virtual ~ElementwiseAddCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
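SubGradDX and SubGradDY above encode the calculus directly: for out = x - y, d(out)/dx = 1 and d(out)/dy = -1, so dx = dout and dy = -dout elementwise. Adding another binary op to this file only means another functor pair; a hedged sketch of what a multiply functor would look like in the same pattern (not part of this commit; the HOSTDEVICE qualifier is omitted for brevity):

// Illustrative only -- mirrors the SubFunctor/AddFunctor pattern above.
template <typename T>
struct MulFunctor {
  inline T operator()(T a, T b) const { return a * b; }
};
// Its gradients would follow the SubGradDX/SubGradDY shape:
//   dx[i] = dout[i] * y[i];  // d(x * y)/dx = y
//   dy[i] = dout[i] * x[i];  // d(x * y)/dy = x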
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(elementwise_add_x86, retrive_op) {
auto elementwise_add =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"elementwise_add");
ASSERT_FALSE(elementwise_add.empty());
ASSERT_TRUE(elementwise_add.front());
}
TEST(elementwise_add_x86, init) {
ElementwiseAddCompute<float> elementwise_add;
ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat));
ASSERT_EQ(elementwise_add.target(), TARGET(kX86));
}
TEST(elementwise_add_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> y_shape{batch_size, 3, 2, 2};
y.Resize(lite::DDim(y_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto y_data = y.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = 1;
}
for (int64_t i = 0; i < y.dims().production(); i++) {
y_data[i] = 2;
}
// ElementwiseAddCompute elementwise_add;
ElementwiseAddCompute<float> elementwise_add;
operators::ElementwiseParam param;
param.X = &x;
param.Y = &y;
param.Out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
elementwise_add.SetParam(param);
elementwise_add.SetContext(std::move(ctx));
elementwise_add.Run();
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
...@@ -12,89 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include <Eigen/Core>
+#include "paddle/fluid/lite/kernels/x86/fc_compute.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
#include "paddle/fluid/lite/operators/fc_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
void fc_compute_eigen(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> W(w, w_h, w_w);
Eigen::Map<matrix_t> Out(out, x_h, w_w);
Out = X * W;
if (b) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_w);
Out = Out.array().rowwise() + B.transpose().array();
}
}
template <typename T>
void fc_compute_naive(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
CHECK_EQ(x_w, w_h);
// out shape: (x_h, w_w)
memset(out, 0, x_h * w_w * sizeof(T));
for (int i = 0; i < x_h; i++) {
for (int j = 0; j < w_w; j++) {
T tmp = static_cast<T>(0);
for (int k = 0; k < x_w; k++) {
tmp += x[i * x_w + k] * w[k * w_w + j];
}
out[i * w_w + j] = tmp + b[j];
}
}
}
template <typename T>
class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
CHECK_GE(param.input->dims().size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
fc_compute_eigen(
param.input->data<T>(), // x
param.input->dims().Slice(0, param.in_num_col_dims).production(),
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production(),
param.w->data<T>(), // w
param.w->dims()[0], // w_h
param.w->dims()[1], // w_w
param.bias->data<T>(), // b
param.output->mutable_data<T>());
}
virtual ~FcCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fc, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::FcCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
#include "paddle/fluid/lite/operators/fc_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
void fc_compute_eigen(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> W(w, w_h, w_w);
Eigen::Map<matrix_t> Out(out, x_h, w_w);
Out = X * W;
if (b) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_w);
Out = Out.array().rowwise() + B.transpose().array();
}
}
template <typename T>
void fc_compute_naive(const T* x, int x_h, int x_w, //
const T* w, int w_h, int w_w, //
const T* b, //
T* out) {
CHECK_EQ(x_w, w_h);
// out shape: (x_h, w_w)
memset(out, 0, x_h * w_w * sizeof(T));
for (int i = 0; i < x_h; i++) {
for (int j = 0; j < w_w; j++) {
T tmp = static_cast<T>(0);
for (int k = 0; k < x_w; k++) {
tmp += x[i * x_w + k] * w[k * w_w + j];
}
out[i * w_w + j] = tmp + b[j];
}
}
}
template <typename T>
class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
CHECK_GE(param.input->dims().size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
fc_compute_eigen(
param.input->data<T>(), // x
param.input->dims().Slice(0, param.in_num_col_dims).production(),
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production(),
param.w->data<T>(), // w
param.w->dims()[0], // w_h
param.w->dims()[1], // w_w
param.bias->data<T>(), // b
param.output->mutable_data<T>());
}
virtual ~FcCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/fc_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(fc_x86, retrive_op) {
auto fc =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("fc");
ASSERT_FALSE(fc.empty());
ASSERT_TRUE(fc.front());
}
TEST(fc_x86, init) {
FcCompute<float> fc;
ASSERT_EQ(fc.precision(), PRECISION(kFloat));
ASSERT_EQ(fc.target(), TARGET(kX86));
}
TEST(fc_x86, run_test) {
lite::Tensor x, w, b, out;
constexpr int batch_size = 2;
std::vector<int64_t> x_shape{batch_size, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> w_shape{3, 4};
w.Resize(lite::DDim(w_shape));
std::vector<int64_t> b_shape{1, 4};
b.Resize(lite::DDim(b_shape));
std::vector<int64_t> out_shape{batch_size, 4};  // fc writes x_h * w_w = 8 floats
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto w_data = w.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < w.dims().production(); i++) {
w_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < b.dims().production(); i++) {
b_data[i] = static_cast<float>(i);
}
/* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, //
w_data, 3, 4, //
b_data, ref_data); */
// FcCompute fc;
FcCompute<float> fc;
operators::FcParam param;
param.in_num_col_dims = 1;
param.input = &x;
param.w = &w;
param.bias = &b;
param.output = &out;
param.in_mat_dims = x.dims();
// std::unique_ptr<KernelContext> ctx(new KernelContext);
// ctx->As<X86Context>();
fc.SetParam(param);
// fc.SetContext(std::move(ctx));
fc.Run();
VLOG(3) << "output vs ref";
for (int i = 0; i < out.dims().production(); i++) {
VLOG(3) << out_data[i];
}
/* for (int i = 0; i < out.dims().product(); ++i) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
}*/
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
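Shape-wise, the FC above multiplies a (2, 3) input by a (3, 4) weight and row-broadcasts a (1, 4) bias, so the output holds batch_size * w_w = 8 floats (hence the out_shape fix above). A standalone rerun of the same arithmetic with the test's ramp inputs, mirroring fc_compute_naive (illustrative):

#include <cassert>
#include <vector>

int main() {
  const int x_h = 2, x_w = 3, w_w = 4;
  std::vector<float> x(x_h * x_w), w(x_w * w_w), b(w_w), out(x_h * w_w, 0.f);
  for (int i = 0; i < x_h * x_w; ++i) x[i] = static_cast<float>(i);
  for (int i = 0; i < x_w * w_w; ++i) w[i] = static_cast<float>(i);
  for (int j = 0; j < w_w; ++j) b[j] = static_cast<float>(j);
  // out[i][j] = sum_k x[i][k] * w[k][j] + b[j]
  for (int i = 0; i < x_h; ++i) {
    for (int j = 0; j < w_w; ++j) {
      for (int k = 0; k < x_w; ++k) {
        out[i * w_w + j] += x[i * x_w + k] * w[k * w_w + j];
      }
      out[i * w_w + j] += b[j];
    }
  }
  // Row 0 is [0, 1, 2] against w's columns, plus bias: {20, 24, 28, 32}.
  assert(out[0] == 20.f && out[3] == 32.f);
  return 0;
}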
...@@ -12,122 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/kernels/x86/mul_compute.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = framework::Tensor;
template <typename T>
class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulParam>();
CHECK(context.x86_device_context());
param.output->template mutable_data<T>();
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
const Tensor x_matrix = x->dims().size() > 2 ? framework::ReshapeToMatrix(
*x, param.x_num_col_dims)
: *x;
const Tensor y_matrix = y->dims().size() > 2 ? framework::ReshapeToMatrix(
*y, param.y_num_col_dims)
: *y;
auto* z = &param.output->raw_tensor();
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
virtual ~MulCompute() = default;
};
template <typename T>
class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulGradParam>();
CHECK(context.x86_device_context());
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
auto x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, param.x_num_col_dims)
: static_cast<const Tensor&>(*x);
auto y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, param.y_num_col_dims)
: static_cast<const Tensor&>(*y);
auto* dout = &param.output_grad->raw_tensor();
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize(
{framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]});
auto* dx = &param.x_grad->raw_tensor();
auto* dy = &param.y_grad->raw_tensor();
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
if (dx) {
// dx->mutable_data<T>(context.x86_device_context->GetPlace());
param.x_grad->template mutable_data<T>();
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
*dx, param.x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
// dy->yutable_data<T>(context.x86_device_context->GetPlace());
param.y_grad->template mutable_data<T>();
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix(
*dy, param.y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
virtual ~MulGradCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul, kX86, kFloat, kNCHW,
paddle::lite::kernels::x86::MulCompute<float>, def)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
using Tensor = framework::Tensor;
template <typename T>
class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulParam>();
CHECK(context.x86_device_context());
param.output->template mutable_data<T>();
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
Tensor x_matrix, y_matrix;
if (x->dims().size() > 2) {
x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims);
} else {
x_matrix = *x;
}
if (y->dims().size() > 2) {
y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims);
} else {
y_matrix = *y;
}
auto* z = &param.output->raw_tensor();
auto z_dim = z->dims();
if (z_dim.size() != 2) {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
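    // z is temporarily viewed as a 2-D matrix for the GEMM call below; its
    // original rank is restored right after the multiplication.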
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) {
z->Resize(z_dim);
}
}
virtual ~MulCompute() = default;
};
template <typename T>
class MulGradCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
void Run() override {
auto& context = ctx_->As<X86Context>();
auto& param = *param_.get_mutable<operators::MulGradParam>();
CHECK(context.x86_device_context());
auto* x = &param.x->raw_tensor();
auto* y = &param.y->raw_tensor();
Tensor x_matrix, y_matrix;
if (x->dims().size() > 2) {
x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims);
} else {
x_matrix = *x;
}
if (y->dims().size() > 2) {
y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims);
} else {
y_matrix = *y;
}
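    // The incoming gradient dout is viewed as an M x N matrix, where M is the
    // row count of the flattened x and N is the column count of the flattened
    // y, matching the shape of the forward output z.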
auto* dout = &param.output_grad->raw_tensor();
Tensor dout_mat;
dout_mat.ShareDataWith(*dout);
dout_mat.Resize(
{framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0],
framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]});
auto* dx = &param.x_grad->raw_tensor();
auto* dy = &param.y_grad->raw_tensor();
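    // Propagate LoD (sequence) information from the inputs so the gradients
    // keep the same sequence layout as x and y.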
if (dx != nullptr) {
dx->set_lod(x->lod());
}
if (dy != nullptr) {
dy->set_lod(y->lod());
}
auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(
*context.x86_device_context());
if (dx) {
      // dx->mutable_data<T>(context.x86_device_context()->GetPlace());
param.x_grad->template mutable_data<T>();
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix(
*dx, param.x_num_col_dims)
: *dx;
      // dx = dout * y'. dx: M x K, dout: M x N, y: K x N
blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
}
if (dy) {
      // dy->mutable_data<T>(context.x86_device_context()->GetPlace());
param.y_grad->template mutable_data<T>();
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix(
*dy, param.y_num_col_dims)
: *dy;
      // dy = x' * dout. dy: K x N, dout: M x N, x: M x K
blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
}
}
virtual ~MulGradCompute() = default;
};
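// Sanity check on the gradient math (standard matrix-calculus identities):
// for z = x * y with x: M x K, y: K x N and dout: M x N,
//   dx = dout * y^T  -> (M x N) * (N x K) = M x K
//   dy = x^T * dout  -> (K x M) * (M x N) = K x N
// which is exactly what the two MatMul calls above compute via their
// transpose flags.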
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/mul_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(mul_x86, retrieve_op) {
auto mul =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("mul");
ASSERT_FALSE(mul.empty());
ASSERT_TRUE(mul.front());
}
TEST(mul_x86, init) {
MulCompute<float> mul;
ASSERT_EQ(mul.precision(), PRECISION(kFloat));
ASSERT_EQ(mul.target(), TARGET(kX86));
}
TEST(mul_x86, run_test) {
lite::Tensor x, y, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> y_shape{3, 4};
y.Resize(lite::DDim(y_shape));
std::vector<int64_t> out_shape{batch_size, 4};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto y_data = y.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < y.dims().production(); i++) {
y_data[i] = static_cast<float>(i);
}
// MulCompute mul;
MulCompute<float> mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
mul.SetContext(std::move(ctx));
mul.SetParam(param);
mul.Run();
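  // With x = [0, 1, 2] and y[i] = i laid out as a 3 x 4 matrix, the product is
  // hand-checkable: out[j] = 1 * (4 + j) + 2 * (8 + j) = 20 + 3 * j, i.e.
  // {20, 23, 26, 29}. A minimal assertion sketch (the original test only logs
  // the output):
  for (int i = 0; i < out.dims().production(); i++) {
    EXPECT_NEAR(out_data[i], 20.f + 3.f * i, 1e-5f);
  }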
LOG(INFO) << "output: ";
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
...@@ -12,69 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/pool_compute.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
if (param.global_pooling) {
for (size_t i = 0; i < param.ksize.size(); ++i) {
param.paddings[i] = 0;
param.ksize[i] = static_cast<int>(param.x->dims()[i + 2]);
}
}
switch (param.ksize.size()) {
case 2: {
if (param.pooling_type == "max") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::MaxPool<T>,
T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, true, false,
&(param.output->raw_tensor()));
} else if (param.pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::AvgPool<T>,
T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, param.exclusive, param.adaptive,
&(param.output->raw_tensor()));
}
} break;
case 3: {
} break;
}
}
virtual ~PoolCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::PoolCompute<float>, def)
    .BindInput("x", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
class PoolCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::PoolParam;
void Run() override {
auto& param = *param_.get_mutable<param_t>();
if (param.global_pooling) {
for (size_t i = 0; i < param.ksize.size(); ++i) {
param.paddings[i] = 0;
param.ksize[i] = static_cast<int>(param.x->dims()[i + 2]);
}
}
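    // Global pooling overrides the configured window: paddings are zeroed and
    // ksize is stretched to the full spatial extent, so e.g. an NCHW input of
    // 1 x 3 x 8 x 8 pools with ksize = {8, 8} down to a 1 x 3 x 1 x 1 output.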
switch (param.ksize.size()) {
case 2: {
if (param.pooling_type == "max") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::MaxPool<T>,
T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, true, false,
&(param.output->raw_tensor()));
} else if (param.pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
platform::CPUDeviceContext, paddle::operators::math::AvgPool<T>,
T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(),
param.ksize, param.strides, param.paddings,
pool_process, param.exclusive, param.adaptive,
&(param.output->raw_tensor()));
}
} break;
case 3: {
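        // 3-D pooling is not implemented here yet; this branch is a
        // placeholder.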
} break;
}
}
virtual ~PoolCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
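// A minimal usage sketch for the kernel above, mirroring the mul test earlier
// in this commit (the scaffolding shown is assumed, not part of the original
// file):
//
//   lite::Tensor x, out;
//   x.Resize(lite::DDim(std::vector<int64_t>{1, 1, 4, 4}));
//   out.Resize(lite::DDim(std::vector<int64_t>{1, 1, 2, 2}));
//   x.mutable_data<float>();
//   out.mutable_data<float>();
//   operators::PoolParam param;
//   param.x = &x;
//   param.output = &out;
//   param.pooling_type = "max";
//   param.ksize = {2, 2};
//   param.strides = {2, 2};
//   param.paddings = {0, 0};
//   paddle::lite::kernels::x86::PoolCompute<float> pool;
//   pool.SetParam(param);
//   pool.Run();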