diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bedbbefa85a730ff2934a12597988a67e73c1a4..15a7c6b07417adfacd461e95c0b92f658e1e11cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) +set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR}) include(system) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 8f65decda77946878daffda0afffab8e0b74b83d..725cf28037182bfd0a27d49491bc8a479a9b4440 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -88,7 +88,7 @@ # # including binary directory for generated headers. -include_directories(${CMAKE_BINARY_DIR}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE) find_package(Threads REQUIRED) @@ -106,7 +106,7 @@ function(merge_static_libs TARGET_NAME) if(APPLE) # Use OSX's libtool to merge archives # To produce a library we need at least one source file. - # It is created by add_custom_command below and will helps + # It is created by add_custom_command below and will helps # also help to track dependencies. set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) @@ -144,24 +144,24 @@ function(merge_static_libs TARGET_NAME) DEPENDS ${lib} ${objdir} WORKING_DIRECTORY ${objdir}) - # Empty dummy source file that goes into merged library - set(mergebase ${lib}.mergebase.c) - add_custom_command(OUTPUT ${mergebase} - COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} - DEPENDS ${objlistfile}) + # Empty dummy source file that goes into merged library + set(mergebase ${lib}.mergebase.c) + add_custom_command(OUTPUT ${mergebase} + COMMAND ${CMAKE_COMMAND} -E touch ${mergebase} + DEPENDS ${objlistfile}) list(APPEND mergebases "${mergebase}") endforeach() add_library(${TARGET_NAME} STATIC ${mergebases}) - target_link_libraries(${TARGET_NAME} ${libs_deps}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) # Get the file name of the generated library set(outlibfile "$") foreach(lib ${libs}) add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND ${CMAKE_AR} cr ${outlibfile} *.o + COMMAND ${CMAKE_AR} cr ${outlibfile} *.o COMMAND ${CMAKE_RANLIB} ${outlibfile} WORKING_DIRECTORY ${lib}.objdir) endforeach() @@ -362,4 +362,4 @@ function(py_proto_compile TARGET_NAME) set(py_srcs) protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs}) -endfunction() \ No newline at end of file +endfunction() diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc index 465ad5e0d2089121a0f11ab916afe0420cbcfab7..6eec5d846fa5ef6b25e7646200dad1d452dda806 100644 --- a/paddle/optimizer/adadelta_optimizer.cc +++ b/paddle/optimizer/adadelta_optimizer.cc @@ -27,22 +27,24 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) { const char* AdadeltaOptimizer::SerializeState(int* state_len) { AdadeltaOptimizerState state; - // TODO(zhihong) : add lr_policy serialization state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*accum_gradient_, state.mutable_accum_gradient()); TensorToProto(*accum_delta_, state.mutable_accum_delta()); TensorToProto(*update_delta_, state.mutable_update_delta()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdadeltaOptimizer::DeserializeState(const std::string& str) { AdadeltaOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc index bdaa7877d2bc58c17c51b977852d4b6fec511ed2..5b92610ac547ee11cedf2e49e4d7f1db4b2da646 100644 --- a/paddle/optimizer/adagrad_optimizer.cc +++ b/paddle/optimizer/adagrad_optimizer.cc @@ -19,20 +19,23 @@ void AdagradOptimizer::Update(const Tensor* gradient) { } const char* AdagradOptimizer::SerializeState(int* state_len) { AdagradOptimizerState state; - // TODO(zhihong) : add lr_policy serialization state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*accum_gradient_, state.mutable_accum_gradient()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdagradOptimizer::DeserializeState(const std::string& str) { AdagradOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); + num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); ProtoToTensor(state.accum_gradient(), accum_gradient_); diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc index ceab7397d87349c64ca9e5d11990cb38068421be..1ebb6b1e0f7b4edcbac1b28319fd4de576f85f6a 100644 --- a/paddle/optimizer/adam_optimizer.cc +++ b/paddle/optimizer/adam_optimizer.cc @@ -24,20 +24,23 @@ void AdamOptimizer::Update(const Tensor *gradient) { const char *AdamOptimizer::SerializeState(int *state_len) { AdamOptimizerState state; - // TODO(zhihong) : add lr_policy serialization + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); state.set_num_sample_passed(num_sample_passed_); + TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*momentums_, state.mutable_momentums()); TensorToProto(*velocitys_, state.mutable_velocitys()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdamOptimizer::DeserializeState(const std::string &str) { AdamOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h index d8e33ad37ab4c019a36f63f34babe65cf8c8fb16..036c376e10f465c2866a230caf9224f4af5478bc 100644 --- a/paddle/optimizer/lr_policy.h +++ b/paddle/optimizer/lr_policy.h @@ -17,36 +17,56 @@ public: // constant learning rate policy class ConstLr final : public LrPolicy { public: - ConstLr(double lr) : learning_rate(lr){}; + ConstLr(double lr) : learning_rate_(lr){}; double LearningRate(const uint64_t num_sample_passed) { - return learning_rate; + return learning_rate_; + } + const char *SerializeState(int *state_len) { + LrPolicyState state; + state.set_learning_rate(learning_rate_); + auto str = state.SerializeAsString(); + *state_len = str.size(); + return str.c_str(); + } + void DeserializeState(const std::string &str) { + LrPolicyState state; + state.ParseFromString(str); + learning_rate_ = state.learning_rate(); } - const char *SerializeState(int *state_len) { return nullptr; } - void DeserializeState(const std::string &state) {} private: - double learning_rate; + double learning_rate_; }; class LinearLr final : public LrPolicy { public: LinearLr(double lr, double lr_decay_a, double lr_decay_b) - : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {} + : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {} double LearningRate(const uint64_t num_sample_passed) { - return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b); + return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed, + lr_decay_b_); } const char *SerializeState(int *state_len) { - // TODO(zhihong) : add lr_policy serialization - return nullptr; + LrPolicyState state; + state.set_learning_rate(learning_rate_); + state.set_lr_decay_a(lr_decay_a_); + state.set_lr_decay_b(lr_decay_b_); + auto str = state.SerializeAsString(); + *state_len = str.size(); + return str.c_str(); } - void DeserializeState(const std::string &state) { - // TODO(zhihong) : add lr_policy serialization + void DeserializeState(const std::string &str) { + LrPolicyState state; + state.ParseFromString(str); + learning_rate_ = state.learning_rate(); + lr_decay_a_ = state.lr_decay_a(); + lr_decay_b_ = state.lr_decay_b(); } private: - double learning_rate; - double lr_decay_a; - double lr_decay_b; + double learning_rate_; + double lr_decay_a_; + double lr_decay_b_; }; } // namespace optimizer diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc index 34e051003fa83f11b1f4a39c46856e0372836a1a..15418faa840c19e776f293700ee886991754fb04 100644 --- a/paddle/optimizer/sgd_optimizer.cc +++ b/paddle/optimizer/sgd_optimizer.cc @@ -30,16 +30,20 @@ void SGDOptimizer::Update(const Tensor *gradient) { const char *SGDOptimizer::SerializeState(int *state_len) { SGDOptimizerState state; state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void SGDOptimizer::DeserializeState(const std::string &str) { SGDOptimizerState state; state.ParseFromString(str); + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); if (momentum_ != 0.0) ProtoToTensor(state.parameter(), momentums_); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index cc6b52e9271ff00e91f2ca172815c543eb99261d..ebacd5d6dc850ebde5212bba39aea4dde8e4eb03 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -4,3 +4,5 @@ nv_test(cuda_test SRCS cuda_test.cu) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) + +nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h new file mode 100644 index 0000000000000000000000000000000000000000..fcef0a5e3058f1c9d54f9e06a54a09286e2454fd --- /dev/null +++ b/paddle/platform/device_context.h @@ -0,0 +1,159 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/framework/enforce.h" +#ifndef PADDLE_ONLY_CPU +#include "paddle/platform/cuda.h" +#include "paddle/platform/dynload/cublas.h" +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/dynload/curand.h" +#define EIGEN_USE_GPU +#endif +#include "paddle/platform/place.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +class DeviceContext { + public: + virtual ~DeviceContext() {} +}; + +class CPUDeviceContext : public DeviceContext {}; + +#ifndef PADDLE_ONLY_CPU +class GPUPlaceGuard { + public: + explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { + if (previous_ != new_place) { + paddle::platform::SetDeviceId(new_place.device); + } + } + + ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); } + + private: + GPUPlace previous_; +}; + +class CUDADeviceContext : public DeviceContext { + public: + explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { + GPUPlaceGuard guard(gpu_place_); + paddle::platform::throw_on_error(cudaStreamCreate(&stream_), + "cudaStreamCreate failed"); + eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); + eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + } + + void Wait() { + paddle::platform::throw_on_error(cudaStreamSynchronize(stream_), + "cudaStreamSynchronize failed"); + } + + cudaStream_t stream() { return stream_; } + + Eigen::GpuDevice eigen_device() { return *eigen_device_; } + + cublasHandle_t cublas_handle() { + if (!blas_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream( + blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS, + "cublasSetStream failed"); + } + return blas_handle_; + } + + cudnnHandle_t cudnn_handle() { + if (!dnn_handle_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnCreate failed"); + PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream( + dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS, + "cudnnSetStream failed"); + } + return dnn_handle_; + } + + curandGenerator_t curand_generator() { + if (!rand_generator_) { + GPUPlaceGuard guard(gpu_place_); + PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( + &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) == + CURAND_STATUS_SUCCESS, + "curandCreateGenerator failed"); + PADDLE_ENFORCE( + paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( + rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS, + "curandSetPseudoRandomGeneratorSeed failed"); + PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream( + rand_generator_, stream_) == CURAND_STATUS_SUCCESS, + "curandSetStream failed"); + } + return rand_generator_; + } + + ~CUDADeviceContext() { + Wait(); + if (blas_handle_) { + PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) == + CUBLAS_STATUS_SUCCESS, + "cublasDestroy failed"); + } + + if (dnn_handle_) { + PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) == + CUDNN_STATUS_SUCCESS, + "cudnnDestroy failed"); + } + + if (rand_generator_) { + PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator( + rand_generator_) == CURAND_STATUS_SUCCESS, + "curandDestroyGenerator failed"); + } + + delete eigen_stream_; + delete eigen_device_; + + paddle::platform::throw_on_error(cudaStreamDestroy(stream_), + "cudaStreamDestroy failed"); + } + + private: + GPUPlace gpu_place_; + cudaStream_t stream_; + + Eigen::CudaStreamDevice* eigen_stream_; + Eigen::GpuDevice* eigen_device_; + + cublasHandle_t blas_handle_{nullptr}; + + cudnnHandle_t dnn_handle_{nullptr}; + + int random_seed_; + curandGenerator_t rand_generator_{nullptr}; +}; +#endif +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..61be4a307dbf073be7dff4564183240834cc7df6 --- /dev/null +++ b/paddle/platform/device_context_test.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/device_context.h" +#include "gtest/gtest.h" + +TEST(CUDADeviceContext, Init) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::CUDADeviceContext* device_context = + new paddle::platform::CUDADeviceContext(i); + Eigen::GpuDevice gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device.stream()); + cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); + ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + curandGenerator_t curand_handle = device_context->curand_generator(); + ASSERT_NE(nullptr, curand_handle); + delete device_context; + } +} diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index c698d3c2ddbf58a41ac6ee960af83a257325d1f9..2a87e293f64d3398dea2641c3ff292eceec7e154 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -78,11 +78,15 @@ enum DataType { repeated bytes content = 2; } +message LrPolicyState { + // learninRate Policy + optional double learning_rate = 1 [default = 1.0]; + optional double lr_decay_a = 2; + optional double lr_decay_b = 3; +} + message SGDOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -91,9 +95,7 @@ message SGDOptimizerState { message AdadeltaOptimizerState { // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -102,11 +104,9 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } + message AdagradOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -114,10 +114,7 @@ message AdagradOptimizerState { } message AdamOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; diff --git a/python/setup.py.in b/python/setup.py.in index eeffbfe80e3b4b5473b06f7372addd9870034e77..a422b3832f4c9c60bc5406277f9ada7032f85f51 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -34,6 +34,6 @@ setup(name='paddle', '': '${CMAKE_CURRENT_SOURCE_DIR}', # The paddle.v2.framework.proto will be generated while compiling. # So that package points to other directory. - 'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework' + 'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework' }, )