diff --git a/CMakeLists.txt b/CMakeLists.txt index 32b369bec58f4b5ba35813e56023cd3b9a8b0a05..fd552698a837dc02f48d4c20ba7b802d6b4d6602 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) @@ -108,6 +109,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_NGRAPH OFF CACHE STRING + "Disable nGraph when cross-compiling for Android and iOS" FORCE) set(WITH_GOLANG OFF CACHE STRING "Disable golang when cross-compiling for Android and iOS" FORCE) @@ -176,6 +179,7 @@ include(external/protobuf) # download, build, install protobuf include(external/python) # download, build, install python include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn +include(external/ngraph) # download, build, install nGraph include(external/swig) # download, build, install swig include(external/boost) # download boost include(external/any) # download libn::any diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9fea9ca05bce921b105a0f092c321b0c3a55c63c..785148d4f9f44032e2ce5bf93f0dc80fc865808b 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. -INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h IF(${CBLAS_PROVIDER} STREQUAL "MKLML") SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2e335579f32df4f146c8d88e05e684a9a8105e20 --- /dev/null +++ b/cmake/external/ngraph.cmake @@ -0,0 +1,92 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(ngraph INTERFACE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with nGraph in Paddle yet." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE) +ENDIF() + +IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN}) + MESSAGE(WARNING + "nGraph needs mkl-dnn to be enabled." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE) +ENDIF() + +IF(NOT ${WITH_NGRAPH}) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(NGRAPH_PROJECT "extern_ngraph") +SET(NGRAPH_VERSION "0.9") +SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") +SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) +SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) +SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) +SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) +SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") + +ExternalProject_Add( + ${NGRAPH_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib +) + +if(UNIX AND NOT APPLE) + include(GNUInstallDirs) + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) +else() + SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) +endif() +MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") + +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) + +# Workaround for nGraph expecting mklml to be in mkldnn install directory. +ExternalProject_Add_Step( + ${NGRAPH_PROJECT} + PrepareMKL + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so + COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so + DEPENDEES download + DEPENDERS configure +) + +add_dependencies(ngraph ${NGRAPH_PROJECT}) +target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) +target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) +target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) +LIST(APPEND external_project_dependencies ngraph) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 75ffabca7c2ae831662d8e7a275ecc475f6bd9cb..94d8ac30cc5f70e939ebde23a6f6b9f188ed1a73 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -30,66 +30,61 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() -if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. - function(protobuf_generate_python SRCS) - # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake - if(NOT ARGN) - message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") - return() - endif() - - if(PROTOBUF_GENERATE_CPP_APPEND_PATH) - # Create an include path for each file specified - foreach(FIL ${ARGN}) - get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(ABS_PATH ${ABS_FIL} PATH) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - else() - set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) - endif() - - if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) - set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") - endif() - - if(DEFINED Protobuf_IMPORT_DIRS) - foreach(DIR ${Protobuf_IMPORT_DIRS}) - get_filename_component(ABS_PATH ${DIR} ABSOLUTE) - list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) - if(${_contains_already} EQUAL -1) - list(APPEND _protobuf_include_path -I ${ABS_PATH}) - endif() - endforeach() - endif() - - set(${SRCS}) + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified foreach(FIL ${ARGN}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) - get_filename_component(FIL_WE ${FIL} NAME_WE) - if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) - get_filename_component(FIL_DIR ${FIL} DIRECTORY) - if(FIL_DIR) - set(FIL_WE "${FIL_DIR}/${FIL_WE}") - endif() + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() - list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" - COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} - DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} - COMMENT "Running Python protocol buffer compiler on ${FIL}" - VERBATIM ) + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() endforeach() + endif() - set(${SRCS} ${${SRCS}} PARENT_SCOPE) - endfunction() -endif() + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() # Print and set the protobuf library information, # finish this cmake process and exit from this file. @@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3bc3b3c5e3a491275d7e58c165ea7497cb772e52..f06ef199d165455047a602f7ddec23534b99108e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") + +set(PART_CUDA_KERNEL_FILES) function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library @@ -37,6 +39,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu) endif() @@ -330,6 +338,8 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + + if (NOT WIN32) add_subdirectory(reader) endif(NOT WIN32) @@ -356,3 +366,14 @@ if(NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) + +if(WITH_GPU) + foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES}) + file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT}) + if (MATCHED) + string(STRIP ${CMAKE_MATCH_1} MATCHED) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n") + endif() + endforeach() +endif() diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index d74d4db92528d69492ab7b90a98d3775dadc35d1..e4df59c5d51c390cf593add0c5562665c91f33f6 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); + int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index d72e07d76c97e9e455e54980207d7c02842cc04b..dc08ee5efacde5e232d751b13aaf11f51237634a 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index f84ff206fffddef1030b7ed439e887bdfef342a6..95aa9b573c795159079bdb5401b34d7a61252115 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null."); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1); + PADDLE_ENFORCE(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); int batch_size = ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; @@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); } protected: @@ -75,7 +77,7 @@ Gather Operator. $Out = X[Index]$ -Out is obtained by gathering entries of the outer-most dimension +Out is obtained by gathering entries of the outer-most dimension of X indexed by Index and concatenate them together. Example: diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu index 0d86b3127e42f7ee14ba57b1c762e8128a0f2d54..b21da178f3eeaafa41bde5f64cc4abcf7944b032 100644 --- a/paddle/fluid/operators/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_max_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max, int, ops::MaxFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_max_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_max_op.part.cu b/paddle/fluid/operators/reduce_max_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..6954c8d744faee6f8f0b715d6e4c8e3bcda7fb83 --- /dev/null +++ b/paddle/fluid/operators/reduce_max_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_max_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 59b30244839849d79e3e531953134633503c4090..4408200d2d052c2f68c2dd35619de6ed67f07f6e 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel, ops::ReduceMeanKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_mean_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_mean_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..4b663bcdca7c20f8802d962a362f429d8eafe9af --- /dev/null +++ b/paddle/fluid/operators/reduce_mean_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_mean_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_mean_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu index da466f805eff4709dc23471baef03e94052ee6c1..5a04a12b79444dcea30d3c1140d9708a98b55fe3 100644 --- a/paddle/fluid/operators/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_min_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min, int, ops::MinFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_min_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_min_op.part.cu b/paddle/fluid/operators/reduce_min_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..5b8f061b2d03eb76863401905ac87044fd5ea778 --- /dev/null +++ b/paddle/fluid/operators/reduce_min_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_min_max_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_min_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu index d62e677d92cffecf629d1684026b0c7bcfec29e3..d8692afb96e4d5d3206210060684dd12fb4d79a7 100644 --- a/paddle/fluid/operators/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_prod_op.cu @@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod, int, ops::ProdFunctor>, ops::ReduceKernel); -REGISTER_OP_CUDA_KERNEL( - reduce_prod_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_prod_op.part.cu b/paddle/fluid/operators/reduce_prod_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..486c578c64b9a2d80abc940a7c4266ef5fd23c7f --- /dev/null +++ b/paddle/fluid/operators/reduce_prod_op.part.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_prod_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_prod_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index 53cd9e9419dd9aecee730917ae21d7a4ab332ffc..2b031e8df99768c9208146640bddbe51149b2614 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel { REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel, ops::ReduceSumKernel); - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_sum_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..525633f62a95b2d0d677fcbebe551b75cb2a180d --- /dev/null +++ b/paddle/fluid/operators/reduce_sum_op.part.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/cub_reduce.h" +#include "paddle/fluid/operators/reduce_sum_op.h" + +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ac7d69bfb549fd98c76fcf834e8d3ad9bec2ef23..b2e79f6c82bb748293f4219845e6798347c8c46e 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 39af717615c01f5c121e32b176b74d05be738531..8bae6606c94620ab4fa8ae34f69236e7e87e9670 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1); + PADDLE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; auto src_dims = src.dims(); diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index 3f4b48bc7391def082c82ed451fc5a752009a2f1..9345b495415d203728238c19621a20f446c40bf5 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CPU_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 92c1bde2bcf089e5c715e90e564408e6ad37ba17..bf2a9e5b3d22996e688621727cb280dc9aed7859 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -18,8 +18,12 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel, - ops::StackKernel); + ops::StackKernel, + ops::StackKernel, + ops::StackKernel); REGISTER_OP_CUDA_KERNEL(stack_grad, ops::StackGradKernel, - ops::StackGradKernel); + ops::StackGradKernel, + ops::StackGradKernel, + ops::StackGradKernel); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 595af0c29a5c2c30427a82205d3d56c49492c0c8..a51c9becd416af243cb473c8856141db8d9f3bf0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -139,6 +139,7 @@ function cmake_gen() { -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} @@ -171,6 +172,7 @@ EOF -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ + -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ diff --git a/python/setup.py.in b/python/setup.py.in index 48db2420b4428cca134a4b1736f042337709e58a..26f3c5aeaa17f60ccdac849e589c779a73138088 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -179,6 +179,18 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': raise Exception("patch libmkldnn.so failed, command: %s" % command) package_data['paddle.libs']+=['libmkldnn.so.0'] shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${WITH_NGRAPH}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) + shutil.copy('${NGRAPH_CPU_LIB}', libs_path) + shutil.copy('${NGRAPH_TBB_LIB}', libs_path) + package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}', + '${NGRAPH_CPU_LIB_NAME}', + '${NGRAPH_TBB_LIB_NAME}'] # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py')