提交 61fa5218 编写于 作者: P peizhilin

Merge remote-tracking branch 'upstream/develop' into windows/build

...@@ -46,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F ...@@ -46,6 +46,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
...@@ -108,6 +109,8 @@ if(ANDROID OR IOS) ...@@ -108,6 +109,8 @@ if(ANDROID OR IOS)
"Disable RDMA when cross-compiling for Android and iOS" FORCE) "Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE) "Disable MKL when cross-compiling for Android and iOS" FORCE)
set(WITH_NGRAPH OFF CACHE STRING
"Disable nGraph when cross-compiling for Android and iOS" FORCE)
set(WITH_GOLANG OFF CACHE STRING set(WITH_GOLANG OFF CACHE STRING
"Disable golang when cross-compiling for Android and iOS" FORCE) "Disable golang when cross-compiling for Android and iOS" FORCE)
...@@ -176,6 +179,7 @@ include(external/protobuf) # download, build, install protobuf ...@@ -176,6 +179,7 @@ include(external/protobuf) # download, build, install protobuf
include(external/python) # download, build, install python include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/boost) # download boost include(external/boost) # download boost
include(external/any) # download libn::any include(external/any) # download libn::any
......
...@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ...@@ -37,7 +37,6 @@ SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
IF(${CBLAS_PROVIDER} STREQUAL "MKLML") IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(ngraph INTERFACE)
IF(WIN32 OR APPLE)
MESSAGE(WARNING
"Windows or Mac is not supported with nGraph in Paddle yet."
"Force WITH_NGRAPH=OFF")
SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE)
ENDIF()
IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN})
MESSAGE(WARNING
"nGraph needs mkl-dnn to be enabled."
"Force WITH_NGRAPH=OFF")
SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE)
ENDIF()
IF(NOT ${WITH_NGRAPH})
return()
ENDIF()
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
SET(NGRAPH_VERSION "0.9")
SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
ExternalProject_Add(
${NGRAPH_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT}
GIT_REPOSITORY ${NGRAPH_GIT_REPO}
GIT_TAG ${NGRAPH_GIT_TAG}
PREFIX ${NGRAPH_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR}
CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE
CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE
CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
)
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
PrepareMKL
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_LIB} ${MKLDNN_INSTALL_DIR}/lib/libmklml_intel.so
COMMAND ${CMAKE_COMMAND} -E create_symlink ${MKLML_IOMP_LIB} ${MKLDNN_INSTALL_DIR}/lib/libiomp5.so
DEPENDEES download
DEPENDERS configure
)
add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
...@@ -30,9 +30,7 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY) ...@@ -30,9 +30,7 @@ UNSET_VAR(PROTOBUF_LITE_LIBRARY)
UNSET_VAR(PROTOBUF_LIBRARY) UNSET_VAR(PROTOBUF_LIBRARY)
UNSET_VAR(PROTOBUF_INCLUDE_DIR) UNSET_VAR(PROTOBUF_INCLUDE_DIR)
UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
function(protobuf_generate_python SRCS)
if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined.
function(protobuf_generate_python SRCS)
# shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
if(NOT ARGN) if(NOT ARGN)
message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
...@@ -52,7 +50,6 @@ if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate ...@@ -52,7 +50,6 @@ if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate
else() else()
set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
endif() endif()
...@@ -77,19 +74,17 @@ if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate ...@@ -77,19 +74,17 @@ if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate
set(FIL_WE "${FIL_DIR}/${FIL_WE}") set(FIL_WE "${FIL_DIR}/${FIL_WE}")
endif() endif()
endif() endif()
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
add_custom_command( add_custom_command(
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
COMMAND ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE} DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE}
COMMENT "Running Python protocol buffer compiler on ${FIL}" COMMENT "Running Python protocol buffer compiler on ${FIL}"
VERBATIM ) VERBATIM )
endforeach() endforeach()
set(${SRCS} ${${SRCS}} PARENT_SCOPE) set(${SRCS} ${${SRCS}} PARENT_SCOPE)
endfunction() endfunction()
endif()
# Print and set the protobuf library information, # Print and set the protobuf library information,
# finish this cmake process and exit from this file. # finish this cmake process and exit from this file.
...@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -126,6 +121,7 @@ macro(PROMPT_PROTOBUF_LIB)
# FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
# make `protobuf_generate_cpp` happy. # make `protobuf_generate_cpp` happy.
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
FOREACH(dep ${protobuf_DEPS}) FOREACH(dep ${protobuf_DEPS})
ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf ${dep})
ADD_DEPENDENCIES(protobuf_lite ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep})
......
...@@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS) ...@@ -5,6 +5,8 @@ list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "") set(DEPS_OPS "")
set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
set(PART_CUDA_KERNEL_FILES)
function(op_library TARGET) function(op_library TARGET)
# op_library is a function to create op library. The interface is same as # op_library is a function to create op library. The interface is same as
# cc_library. But it handle split GPU/CPU code and link some common library # cc_library. But it handle split GPU/CPU code and link some common library
...@@ -37,6 +39,12 @@ function(op_library TARGET) ...@@ -37,6 +39,12 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE)
list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_cu_srcs ${TARGET}.hip.cu) list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
endif() endif()
...@@ -330,6 +338,8 @@ foreach(src ${GENERAL_OPS}) ...@@ -330,6 +338,8 @@ foreach(src ${GENERAL_OPS})
endforeach() endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
if (NOT WIN32) if (NOT WIN32)
add_subdirectory(reader) add_subdirectory(reader)
endif(NOT WIN32) endif(NOT WIN32)
...@@ -356,3 +366,14 @@ if(NOT WIN32) ...@@ -356,3 +366,14 @@ if(NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
if(WITH_GPU)
foreach(CUDA_KERNEL_FILE ${PART_CUDA_KERNEL_FILES})
file(READ ${CUDA_KERNEL_FILE} TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CUDA_KERNEL\\(\\n?([^,]+),.*" MATCHED ${TARGET_CONTENT})
if (MATCHED)
string(STRIP ${CMAKE_MATCH_1} MATCHED)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MATCHED}, CUDA);\n")
endif()
endforeach()
endif()
...@@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -50,7 +50,9 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) { const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place)); // PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1 ||
(index.dims().size() == 2 && index.dims()[1] == 1));
int index_size = index.dims()[0]; int index_size = index.dims()[0];
auto src_dims = src.dims(); auto src_dims = src.dims();
......
...@@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -38,7 +38,8 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) { const Tensor& index, Tensor* output) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1 ||
(index.dims().size() == 2 && index.dims()[1] == 1));
int64_t index_size = index.dims()[0]; int64_t index_size = index.dims()[0];
auto src_dims = src.dims(); auto src_dims = src.dims();
......
...@@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel { ...@@ -31,7 +31,8 @@ class GatherOp : public framework::OperatorWithKernel {
"Output(Out) of GatherOp should not be null."); "Output(Out) of GatherOp should not be null.");
auto index_dims = ctx->GetInputDim("Index"); auto index_dims = ctx->GetInputDim("Index");
PADDLE_ENFORCE(index_dims.size() == 1); PADDLE_ENFORCE(index_dims.size() == 1 ||
(index_dims.size() == 2 && index_dims[1] == 1));
int batch_size = ctx->GetInputDim("Index")[0]; int batch_size = ctx->GetInputDim("Index")[0];
framework::DDim output_dims(ctx->GetInputDim("X")); framework::DDim output_dims(ctx->GetInputDim("X"));
output_dims[0] = batch_size; output_dims[0] = batch_size;
...@@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel { ...@@ -53,6 +54,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X"));
} }
protected: protected:
......
...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max, ...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_max,
int, ops::MaxFunctor>, int, ops::MaxFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MaxFunctor>); int64_t, ops::MaxFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(
reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
...@@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>, ...@@ -69,13 +69,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel<float>,
ops::ReduceMeanKernel<double>, ops::ReduceMeanKernel<double>,
ops::ReduceMeanKernel<int>, ops::ReduceMeanKernel<int>,
ops::ReduceMeanKernel<int64_t>); ops::ReduceMeanKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MeanGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_mean_op.h"
REGISTER_OP_CUDA_KERNEL(
reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MeanGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MeanGradFunctor>);
...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min, ...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_min,
int, ops::MinFunctor>, int, ops::MinFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::MinFunctor>); int64_t, ops::MinFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_min_max_op.h"
REGISTER_OP_CUDA_KERNEL(
reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::MaxOrMinGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::MaxOrMinGradFunctor>);
...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod, ...@@ -23,12 +23,3 @@ REGISTER_OP_CUDA_KERNEL(reduce_prod,
int, ops::ProdFunctor>, int, ops::ProdFunctor>,
ops::ReduceKernel<paddle::platform::CUDADeviceContext, ops::ReduceKernel<paddle::platform::CUDADeviceContext,
int64_t, ops::ProdFunctor>); int64_t, ops::ProdFunctor>);
REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::ProdGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_prod_op.h"
REGISTER_OP_CUDA_KERNEL(
reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::ProdGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::ProdGradFunctor>);
...@@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel<T> { ...@@ -64,13 +64,3 @@ class ReduceSumKernel : public framework::OpKernel<T> {
REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>, REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel<float>,
ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>, ops::ReduceSumKernel<double>, ops::ReduceSumKernel<int>,
ops::ReduceSumKernel<int64_t>); ops::ReduceSumKernel<int64_t>);
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::SumGradFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/cub_reduce.h"
#include "paddle/fluid/operators/reduce_sum_op.h"
REGISTER_OP_CUDA_KERNEL(
reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
ops::SumGradFunctor>);
...@@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -51,7 +51,8 @@ void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) { const Tensor& index, Tensor* output) {
// PADDLE_ENFORCE(platform::is_gpu_place(place)); // PADDLE_ENFORCE(platform::is_gpu_place(place));
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1 ||
(index.dims().size() == 2 && index.dims()[1] == 1));
int index_size = index.dims()[0]; int index_size = index.dims()[0];
auto src_dims = src.dims(); auto src_dims = src.dims();
......
...@@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, ...@@ -37,7 +37,8 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
const Tensor& index, Tensor* output) { const Tensor& index, Tensor* output) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
// check index of shape 1-D // check index of shape 1-D
PADDLE_ENFORCE(index.dims().size() == 1); PADDLE_ENFORCE(index.dims().size() == 1 ||
(index.dims().size() == 2 && index.dims()[1] == 1));
int index_size = index.dims()[0]; int index_size = index.dims()[0];
auto src_dims = src.dims(); auto src_dims = src.dims();
......
...@@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker, ...@@ -21,8 +21,12 @@ REGISTER_OPERATOR(stack, ops::StackOp, ops::StackOpMaker,
REGISTER_OPERATOR(stack_grad, ops::StackOpGrad); REGISTER_OPERATOR(stack_grad, ops::StackOpGrad);
REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>, REGISTER_OP_CPU_KERNEL(stack, ops::StackKernel<plat::CPUDeviceContext, float>,
ops::StackKernel<plat::CPUDeviceContext, double>); ops::StackKernel<plat::CPUDeviceContext, double>,
ops::StackKernel<plat::CPUDeviceContext, int>,
ops::StackKernel<plat::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(stack_grad, REGISTER_OP_CPU_KERNEL(stack_grad,
ops::StackGradKernel<plat::CPUDeviceContext, float>, ops::StackGradKernel<plat::CPUDeviceContext, float>,
ops::StackGradKernel<plat::CPUDeviceContext, double>); ops::StackGradKernel<plat::CPUDeviceContext, double>,
ops::StackGradKernel<plat::CPUDeviceContext, int>,
ops::StackGradKernel<plat::CPUDeviceContext, int64_t>);
...@@ -18,8 +18,12 @@ namespace plat = paddle::platform; ...@@ -18,8 +18,12 @@ namespace plat = paddle::platform;
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>, REGISTER_OP_CUDA_KERNEL(stack, ops::StackKernel<plat::CUDADeviceContext, float>,
ops::StackKernel<plat::CUDADeviceContext, double>); ops::StackKernel<plat::CUDADeviceContext, double>,
ops::StackKernel<plat::CUDADeviceContext, int>,
ops::StackKernel<plat::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(stack_grad, REGISTER_OP_CUDA_KERNEL(stack_grad,
ops::StackGradKernel<plat::CUDADeviceContext, float>, ops::StackGradKernel<plat::CUDADeviceContext, float>,
ops::StackGradKernel<plat::CUDADeviceContext, double>); ops::StackGradKernel<plat::CUDADeviceContext, double>,
ops::StackGradKernel<plat::CUDADeviceContext, int>,
ops::StackGradKernel<plat::CUDADeviceContext, int64_t>);
...@@ -139,6 +139,7 @@ function cmake_gen() { ...@@ -139,6 +139,7 @@ function cmake_gen() {
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_NGRAPH=${WITH_NGRAPH:-OFF}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
-DWITH_GOLANG=${WITH_GOLANG:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-OFF}
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
...@@ -171,6 +172,7 @@ EOF ...@@ -171,6 +172,7 @@ EOF
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
-DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
......
...@@ -179,6 +179,18 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': ...@@ -179,6 +179,18 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
raise Exception("patch libmkldnn.so failed, command: %s" % command) raise Exception("patch libmkldnn.so failed, command: %s" % command)
package_data['paddle.libs']+=['libmkldnn.so.0'] package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
if '${WITH_NGRAPH}' == 'ON':
if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode.
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
if os.system(command) != 0:
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
package_data['paddle.libs']+=['${NGRAPH_SHARED_LIB_NAME}',
'${NGRAPH_CPU_LIB_NAME}',
'${NGRAPH_TBB_LIB_NAME}']
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
if os.path.isfile(libs_path+'/__init__.py'): if os.path.isfile(libs_path+'/__init__.py'):
os.remove(libs_path+'/__init__.py') os.remove(libs_path+'/__init__.py')
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册