diff --git a/.clang_format.hook b/.clang_format.hook new file mode 100755 index 0000000000000000000000000000000000000000..1d928216867c0ba3897d71542fea44debf8d72a0 --- /dev/null +++ b/.clang_format.hook @@ -0,0 +1,15 @@ +#!/bin/bash +set -e + +readonly VERSION="3.8" + +version=$(clang-format -version) + +if ! [[ $version == *"$VERSION"* ]]; then + echo "clang-format version check failed." + echo "a version contains '$VERSION' is needed, but get '$version'" + echo "you can install the right version, and make an soft-link to '\$PATH' env" + exit -1 +fi + +clang-format $@ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bb8c88787d37faf9ce4d7d856a307c11f1085d98..a772125df64aaf2eafe6cb9e022f62cc29043eb7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,10 +19,10 @@ - id: end-of-file-fixer - repo: local hooks: - - id: clang-format + - id: clang-format-with-version-check name: clang-format description: Format files with ClangFormat. - entry: clang-format -i + entry: ./.clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ - repo: https://github.com/PaddlePaddle/pre-commit-golang diff --git a/Dockerfile b/Dockerfile index da0047102572d203810d2f9e5ce8ec76063d0cba..98f61ba586a681e53b435d592c8e43b1cc964139 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,20 +71,6 @@ RUN pip install -r /root/requirements.txt RUN apt-get install -y libssl-dev libffi-dev RUN pip install certifi urllib3[secure] -# TODO(qijun) The template library Eigen doesn't work well with GCC 5 -# coming with the default Docker image, so we switch to use GCC 4.8 -# by default. And I will check Eigen library later. - -RUN ln -sf gcc-4.8 /usr/bin/gcc && \ - ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \ - ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \ - ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \ - ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \ - ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \ - ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \ - ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \ - ln -sf g++-4.8 /usr/bin/g++ && \ - ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ # Install woboq_codebrowser to /woboq RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ diff --git a/cmake/flags.cmake b/cmake/flags.cmake index b27eb71550b68b5c27e47bf067ae0df329bbd628..ff246b2eb4ed97dd14d45763569b661cefd203c8 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -9,13 +9,6 @@ function(CheckCompilerCXX11Flag) if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") endif() - if(NOT ANDROID) - # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem. - # Use Debug mode instead for now. - if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) - set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE) - endif() - endif() elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" # Apple Clang is a different compiler than upstream Clang which havs different version numbers. @@ -160,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) if(CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index e956994431fbb43438c56dcd96ad8313cf516090..fe8da907d9d45a2164031430ac5b7a3d5523967a 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -101,6 +101,7 @@ if use_mkldnn 5. 在**Argument**里添加两个`MkldnnMatrixPtr`,取名为`mkldnnValue`和`mkldnnGrad`,用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名),用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。 6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑,用于判断`deviceId`,并针对device在MKL-DNN和CPU之间不统一的情况,做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。 7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 +8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况,所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面,一直保存的是0,所以可以充分利用这个信息,定义一个枚举处理所有MKLDNN的参数格式,从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。 ## References diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 03985260241689a099ae9ebc136bd04831a44167..68304c9fc8b8fa13cb1f99b82517abc87c71496c 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -38,7 +38,7 @@ add_custom_command(TARGET framework_py_proto POST_BUILD WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) cc_library(backward SRCS backward.cc DEPS net_op) -cc_test(backward_test SRCS backward_test.cc DEPS backward) +cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context) if(WITH_PYTHON) cc_library(paddle_pybind SHARED diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 83b7e4cdac9bc79ebf687cf199f6d2bc8d1695cf..c226e4e3d2a58d1a647e204c4cd26f4eb6bcd968 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -17,6 +17,7 @@ #include #include "paddle/framework/op_registry.h" #include "paddle/operators/net_op.h" +#include "paddle/operators/recurrent_op.h" namespace paddle { namespace framework { @@ -178,6 +179,22 @@ std::shared_ptr BackwardRecursive( return false; }); + // process recurrent gradient op as a special operator. + if (forwardOp.Type() == "recurrent_op") { + // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or + // this will result in infinite loop. + const auto& rnnop = + *static_cast(&forwardOp); + auto rnn_grad_op = + static_cast(grad_op.get()); + const auto& stepnet_op = + *static_cast(&rnnop.stepnet()); + // create stepnet's gradient op + auto grad_stepnet = BackwardRecursive(stepnet_op, no_grad_names, uniq_id); + rnn_grad_op->set_stepnet( + std::static_pointer_cast(grad_stepnet)); + } + if (net->ops_.empty()) { // Current no aux op is added to network return grad_op; } diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 8035d93bfec75b20a54c5af0521ab724cafba8ca..9cc4233e43267472d405c3e4e617f0782e1430ea 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) cc_library(memory SRCS memory.cc) -cc_library(memcpy SRCS memcpy.cc DEPS device_context) +cc_library(memcpy SRCS memcpy.cc) cc_library(paddle_memory DEPS diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index aaab1142ca18d3319469a4d685fde9d30929113f..a19a3e3675e3e2e7cc0c3594f21191f932d6379f 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include // for memcpy -#include "paddle/platform/device_context.h" - namespace paddle { namespace memory { diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h index caca644c96c3f8c741bac4a3b5a6239d2a4555c7..171a0bd2ae80245939a9237f51d195e8bc9178fc 100644 --- a/paddle/operators/recurrent_op.h +++ b/paddle/operators/recurrent_op.h @@ -127,7 +127,7 @@ class RecurrentOp final : public framework::OperatorBase { } void set_stepnet(std::shared_ptr net) { stepnet_ = net; } - const NetOp* stepnet() const { return stepnet_.get(); } + const NetOp& stepnet() const { return *stepnet_; } static const rnn::ArgumentName kArgName; @@ -158,7 +158,7 @@ class RecurrentGradientOp final : public framework::OperatorBase { static const rnn::ArgumentName kArgName; void set_stepnet(const std::shared_ptr& net) { stepnet_ = net; } - const NetOp* stepnet() const { return stepnet_.get(); } + const NetOp& stepnet() const { return *stepnet_; } private: RecurrentGradientAlgorithm alg_; diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 4154aad15c39119e2f155cb2c7b5177b5aa78022..acfc0639736beb82df41b851664e7bcd079b5eb1 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -16,5 +16,8 @@ ELSE() set(GPU_CTX_DEPS) ENDIF() -cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) +# memcpy deoends on device_context, here add deps individually for +# avoiding cycle dependencies +cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator + system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index a928e097787db9deebe1c6eab263190caacac7eb..f92c15ae450e94de44d27e77763e791e6bae4426 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/platform/device_context.h" +#include "paddle/memory/memory.h" namespace paddle { namespace platform { @@ -36,6 +37,59 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifndef PADDLE_ONLY_CPU +class EigenCudaStreamDevice : public Eigen::StreamInterface { + public: + EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenCudaStreamDevice() override {} + + void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + stream_ = cuda_stream; + place_ = place; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const cudaStream_t& stream() const override { return *stream_; } + + const cudaDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + return paddle::memory::Alloc(place_, num_bytes); + } + + void deallocate(void* buffer) const override { + paddle::memory::Free(place_, buffer); + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = + static_cast(scratchpad()) + Eigen::kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + PADDLE_ENFORCE( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_)); + } + return semaphore_; + } + + private: + GPUPlace place_; + const cudaStream_t* stream_; // not owned; + const cudaDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + template <> Eigen::GpuDevice* DeviceContext::get_eigen_device() const { return reinterpret_cast(this)->eigen_device(); @@ -43,19 +97,9 @@ Eigen::GpuDevice* DeviceContext::get_eigen_device() const { CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { SetDeviceId(place_.device); - // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly - // here will cause segment fault. We must implement a class derived from - // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id - // later. Please refer to the implementation of class EigenCudaStreamDevice - // in TensorFlow. - // - // We find that CUDA 7 introduces a new option, the per-thread default stream, - // that has two effects. Please refer to https://devblogs.nvidia.com/ - // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ - // - // So, we decide to use default stream and add –default-stream per-thread nvcc - // flag. Than, two threads with two CUDADeviceContexts will run parallelly. - eigen_stream_.reset(new Eigen::CudaStreamDevice()); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + eigen_stream_.reset(new EigenCudaStreamDevice()); + eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); } @@ -75,12 +119,13 @@ CUDADeviceContext::~CUDADeviceContext() { } eigen_stream_.reset(); eigen_device_.reset(); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); } Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - PADDLE_ENFORCE(cudaStreamSynchronize(0)); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { @@ -91,6 +136,7 @@ cublasHandle_t CUDADeviceContext::cublas_handle() { if (!cublas_handle_) { SetDeviceId(place_.device); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); } return cublas_handle_; } @@ -99,10 +145,13 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() { if (!cudnn_handle_) { SetDeviceId(place_.device); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); } return cudnn_handle_; } +cudaStream_t CUDADeviceContext::stream() { return stream_; } + curandGenerator_t CUDADeviceContext::curand_generator() { if (!curand_generator_) { SetDeviceId(place_.device); @@ -110,6 +159,8 @@ curandGenerator_t CUDADeviceContext::curand_generator() { CURAND_RNG_PSEUDO_DEFAULT)); PADDLE_ENFORCE( dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_)); + + PADDLE_ENFORCE(dynload::curandSetStream(curand_generator_, stream_)); } return curand_generator_; } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 08b5b2cff900cc4239a615fe7d7f6b5faa13510b..c5042ae33e47e04521e59e0d91ddd8d4efffe50a 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -52,6 +52,7 @@ class CPUDeviceContext : public DeviceContext { }; #ifndef PADDLE_ONLY_CPU +class EigenCudaStreamDevice; class CUDADeviceContext : public DeviceContext { public: @@ -76,6 +77,9 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return curand handle in the device context. */ curandGenerator_t curand_generator(); + + /*! \brief Return cuda stream in the device context. */ + cudaStream_t stream(); // clang-format on private: @@ -83,15 +87,16 @@ class CUDADeviceContext : public DeviceContext { private: std::unique_ptr eigen_device_; - std::unique_ptr eigen_stream_; + std::unique_ptr eigen_stream_; private: uint64_t seed_; // clang-format off - cudnnHandle_t cudnn_handle_ = nullptr; - cublasHandle_t cublas_handle_ = nullptr; - curandGenerator_t curand_generator_ = nullptr; + cudaStream_t stream_{nullptr}; + cudnnHandle_t cudnn_handle_{nullptr}; + cublasHandle_t cublas_handle_{nullptr}; + curandGenerator_t curand_generator_{nullptr}; // clang-format on }; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 65345c433c0a328e7f89038a39312edba35eb8c7..8b764bdcd9d92e6b2203e45160acee35ec110538 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -45,6 +45,7 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, cublas_handle); curandGenerator_t curand_handle = device_context->curand_generator(); ASSERT_NE(nullptr, curand_handle); + ASSERT_NE(nullptr, device_context->stream()); delete device_context; } }