diff --git a/CMakeLists.txt b/CMakeLists.txt index b2481912232cbca95999994417d7f30e98cd4f26..ed3c390066dfac2322d802c6039bc7155a36e38a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) +option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -129,6 +130,10 @@ if(WITH_GPU) endif(NOT WITH_DSO) endif(WITH_GPU) +if(USE_NNPACK) + list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt") +endif(USE_NNPACK) + add_subdirectory(proto) # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 1c39ced3c9e3da4079a66e29c00be9cc18411b68..1518a8a654cfb54376a49760dc5873733c916937 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -10,6 +10,14 @@ if(WITH_GPU) cuda_compile(cu_objs ${cu_files}) endif() +if(USE_NNPACK) + include(nnpack/nnpack.cmake) + list(APPEND cpp_files nnpack/NNPACKConvOp.cpp) + if(WITH_TESTING) + add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp) + endif() +endif() + add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) add_dependencies(paddle_function paddle_proto) diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e8080c3d714b324f072a380f738b9764477dfe04 --- /dev/null +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -0,0 +1,238 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "nnpack.h" +#include "paddle/function/ConvOp.h" + +DEFINE_bool(nnpack_allocate_outside, + false, + "Allocate and free workspace memory outside the NNPACK interface."); +DEFINE_int32(nnpack_num_threads, + 0, + "The number of nnpack threads" + "default: 0; 0 to disable threadpool."); + +namespace paddle { + +nnp_convolution_algorithm get_nnp_convolution_algorithm( + const std::string& algorithm) { + if (algorithm == "auto") { + return nnp_convolution_algorithm_auto; + } else if (algorithm == "ft8x8") { + return nnp_convolution_algorithm_ft8x8; + } else if (algorithm == "ft16x16") { + return nnp_convolution_algorithm_ft16x16; + } else if (algorithm == "wt8x8") { + return nnp_convolution_algorithm_wt8x8; + } else if (algorithm == "implicit-gemm") { + return nnp_convolution_algorithm_implicit_gemm; + } else if (algorithm == "direct") { + return nnp_convolution_algorithm_direct; + } else { + return nnp_convolution_algorithm_auto; + } +} + +template <DeviceType Device> +class NNPACKConvFunction : public ConvFunctionBase { +public: + void init(const FuncConfig& config) override { + ConvFunctionBase::init(config); + CHECK_EQ(groups_, (size_t)1); + algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo")); + // algorithm_ = nnp_convolution_algorithm_auto; + transform_strategy_ = nnp_convolution_transform_strategy_compute; + nnp_status status = nnp_initialize(); + CHECK_EQ(status, nnp_status_success); + workspaceBuffer_ = nullptr; + workspaceSize_ = 0; + + threadpool_ = nullptr; + if (FLAGS_nnpack_num_threads) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + + ~NNPACKConvFunction() { + if (threadpool_) { + pthreadpool_destroy(threadpool_); + } + if (workspaceBuffer_) { + free(workspaceBuffer_); + } + } + + virtual void check(const BufferArgs& inputs, + const BufferArgs& outputs) override { + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + checkShape(input, filter, output); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + check(inputs, outputs); + const TensorShape& input = inputs[0].shape(); + const TensorShape& filter = inputs[1].shape(); + const TensorShape& output = outputs[0].shape(); + + size_t batchSize = input[0]; + size_t inputChannels = input[1]; + size_t inputHeight = input[2]; + size_t inputWidth = input[3]; + size_t filterHeight = getFilterHeight(filter); + size_t filterWidth = getFilterWidth(filter); + size_t outputChannels = output[1]; + // size_t outputHeight = output[2]; + // size_t outputWidth = output[3]; + + nnp_size inputSize = {.width = inputWidth, .height = inputHeight}; + nnp_padding padding = {.top = (size_t)paddingH(), + .right = (size_t)paddingW(), + .bottom = (size_t)paddingH(), + .left = (size_t)paddingW()}; + nnp_size kernelSize = {.width = filterWidth, .height = filterHeight}; + nnp_size outputSubsampling = {.width = (size_t)strideW(), + .height = (size_t)strideH()}; + + float* inputData = inputs[0].data<float>(); + float* filterData = inputs[1].data<float>(); + float* outputData = outputs[0].data<float>(); + + void* bufferPtr = nullptr; + size_t* sizePtr = nullptr; + size_t needSize; + if (FLAGS_nnpack_allocate_outside) { + if (batchSize == 1) { + nnp_status status = nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + outputSubsampling, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &needSize, + nnp_activation_identity, + nullptr, + nullptr, + nullptr); + CHECK_EQ(status, nnp_status_success); + } + + VLOG(3) << "workspace size is " << needSize; + if (needSize > workspaceSize_) { + workspaceSize_ = needSize; + if (workspaceBuffer_) { + free(workspaceBuffer_); + } else { + posix_memalign(&workspaceBuffer_, 64, needSize); + } + } + + if (needSize) { + bufferPtr = workspaceBuffer_; + sizePtr = &needSize; + } + } + + if (batchSize == 1) { + nnp_status status = + nnp_convolution_inference(algorithm_, + transform_strategy_, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + outputSubsampling, + inputData, + filterData, + nullptr, /* bias */ + outputData, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } else { + // only supports stride = 1 + CHECK_EQ(strideH(), 1); + CHECK_EQ(strideW(), 1); + nnp_status status = nnp_convolution_output(algorithm_, + batchSize, + inputChannels, + outputChannels, + inputSize, + padding, + kernelSize, + inputData, + filterData, + nullptr, /* bias */ + outputData, + bufferPtr, + sizePtr, + nnp_activation_identity, + nullptr, + threadpool_, /* threadpool */ + nullptr); + CHECK_EQ(status, nnp_status_success); + } + } + +private: + nnp_convolution_algorithm algorithm_; + nnp_convolution_transform_strategy transform_strategy_; + void* workspaceBuffer_; + size_t workspaceSize_; + pthreadpool_t threadpool_; +}; + +REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); + +} // namespace paddle diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48180112111c67f36ddd425008187201655089c9 --- /dev/null +++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp @@ -0,0 +1,99 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <gtest/gtest.h> +#include "paddle/function/Function.h" +#include "paddle/function/FunctionTest.h" + +DEFINE_string(algo, + "auto", + "The algorithm (auto, ft8x8, ft16x16, wt8x8, " + "implicit-gemm, or direct) for computing convolution of NNPACK."); + +namespace paddle { + +#define IS_NNPACK_SUPPORT(algo, filterSize, stride) \ + if (algo == "direct" && filterSize != 1) continue; \ + if (algo == "direct" && batchSize != 1) continue; \ + if (algo == "wt8x8" && filterSize != 3) continue; \ + if (algo == "implicit-gemm" && batchSize != 1) continue; \ + if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue; + +class ConvolutionTest { +public: + ConvolutionTest(const std::string& conv1, + const std::string& conv2, + std::string algo = "auto") { + for (size_t batchSize : {1, 32}) { + for (size_t inputSize : {7, 14, 54}) { + for (size_t filterSize : {1, 3, 5}) { + for (size_t inputChannels : {3, 64}) { + for (size_t outputChannels : {3, 64, 128}) { + if (inputChannels < outputChannels) break; + for (size_t stride : {1, 2}) { + // if batchSize > 1 NNPACKConv only supports stride = 1 + if (batchSize > 1 && stride > 1) break; + for (size_t padding : {0, 1}) { + if (padding >= filterSize) break; + size_t outputSize = + (inputSize - filterSize + 2 * padding + stride) / stride; + IS_NNPACK_SUPPORT(algo, filterSize, stride); + LOG(INFO) << " batchSize=" << batchSize + << " inputChannels=" << inputChannels + << " inputHeight=" << inputSize + << " inputWidth=" << inputSize + << " outputChannels=" << outputChannels + << " filterHeight=" << filterSize + << " filterWidth=" << filterSize + << " outputHeight=" << outputSize + << " outputWidth=" << outputSize + << " stride=" << stride << " padding=" << padding; + + std::vector<size_t> paddings = {padding, padding}; + std::vector<size_t> strides = {stride, stride}; + Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test( + conv1, + conv2, + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)1) + .set("algo", algo)); + + TensorShape shape0{ + batchSize, inputChannels, inputSize, inputSize}; + TensorShape shape1{ + outputChannels, inputChannels, filterSize, filterSize}; + TensorShape shape2{ + batchSize, outputChannels, outputSize, outputSize}; + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0)); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2)); + test.run(); + } + } + } + } + } + } + } + } +}; + +TEST(Convolution, NNPACK) { + // NNPACK only supports stride = 1 + ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo); +} + +} // namespace paddle diff --git a/paddle/function/nnpack/nnpack.cmake b/paddle/function/nnpack/nnpack.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7182730ae8f133bdc4f73bfc46fa8acbe5f3b603 --- /dev/null +++ b/paddle/function/nnpack/nnpack.cmake @@ -0,0 +1,16 @@ +# Find the NNPACK library +# NNPACK_ROOT - where to find NNPACK include and library. +# + +set(NNPACK_FOUND OFF) +set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK") +find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include) +find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib) +find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib) + +if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB) + set(NNPACK_FOUND ON) + INCLUDE_DIRECTORIES(${NNPACK_INC_DIR}) +else() + message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})") +endif() diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 914689e66cdb8947e886e17e75829183c1af1a42..af79e65a7c09e5a1b55febf1df1e8f5bb61bdcb8 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -16,6 +16,10 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" +DEFINE_bool(use_nnpack, + false, + "Whether to use nnpack for convolution calculation."); + namespace paddle { /* @@ -37,26 +41,38 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, for (int i = 0; i < config_.inputs_size(); i++) { std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]}; - createFunction(forward_, - !isDeconv_ ? "GemmConv" : "GemmConvGradInput", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - !isDeconv_ ? "GemmConvGradInput" : "GemmConv", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); - - createFunction(backward_, - "GemmConvGradFilter", - FuncConfig() - .set("paddings", paddings) - .set("strides", strides) - .set("groups", (size_t)groups_[i])); + + if (FLAGS_use_nnpack) { + CHECK_EQ(isDeconv_, false); + createFunction(forward_, + "NNPACKConv", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i]) + .set("algo", std::string("auto"))); + } else { + createFunction(forward_, + !isDeconv_ ? "GemmConv" : "GemmConvGradInput", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + !isDeconv_ ? "GemmConvGradInput" : "GemmConv", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + + createFunction(backward_, + "GemmConvGradFilter", + FuncConfig() + .set("paddings", paddings) + .set("strides", strides) + .set("groups", (size_t)groups_[i])); + } } return true; }