diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc index 465ad5e0d2089121a0f11ab916afe0420cbcfab7..6eec5d846fa5ef6b25e7646200dad1d452dda806 100644 --- a/paddle/optimizer/adadelta_optimizer.cc +++ b/paddle/optimizer/adadelta_optimizer.cc @@ -27,22 +27,24 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) { const char* AdadeltaOptimizer::SerializeState(int* state_len) { AdadeltaOptimizerState state; - // TODO(zhihong) : add lr_policy serialization state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*accum_gradient_, state.mutable_accum_gradient()); TensorToProto(*accum_delta_, state.mutable_accum_delta()); TensorToProto(*update_delta_, state.mutable_update_delta()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdadeltaOptimizer::DeserializeState(const std::string& str) { AdadeltaOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc index bdaa7877d2bc58c17c51b977852d4b6fec511ed2..5b92610ac547ee11cedf2e49e4d7f1db4b2da646 100644 --- a/paddle/optimizer/adagrad_optimizer.cc +++ b/paddle/optimizer/adagrad_optimizer.cc @@ -19,20 +19,23 @@ void AdagradOptimizer::Update(const Tensor* gradient) { } const char* AdagradOptimizer::SerializeState(int* state_len) { AdagradOptimizerState state; - // TODO(zhihong) : add lr_policy serialization state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = 
this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*accum_gradient_, state.mutable_accum_gradient()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdagradOptimizer::DeserializeState(const std::string& str) { AdagradOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); + num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); ProtoToTensor(state.accum_gradient(), accum_gradient_); diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc index ceab7397d87349c64ca9e5d11990cb38068421be..1ebb6b1e0f7b4edcbac1b28319fd4de576f85f6a 100644 --- a/paddle/optimizer/adam_optimizer.cc +++ b/paddle/optimizer/adam_optimizer.cc @@ -24,20 +24,23 @@ void AdamOptimizer::Update(const Tensor *gradient) { const char *AdamOptimizer::SerializeState(int *state_len) { AdamOptimizerState state; - // TODO(zhihong) : add lr_policy serialization + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); state.set_num_sample_passed(num_sample_passed_); + TensorToProto(*parameter_, state.mutable_parameter()); TensorToProto(*momentums_, state.mutable_momentums()); TensorToProto(*velocitys_, state.mutable_velocitys()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void AdamOptimizer::DeserializeState(const std::string &str) { AdamOptimizerState state; state.ParseFromString(str); - // TODO(zhihong) : add lr_policy DeserializeState + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = 
state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h index d8e33ad37ab4c019a36f63f34babe65cf8c8fb16..036c376e10f465c2866a230caf9224f4af5478bc 100644 --- a/paddle/optimizer/lr_policy.h +++ b/paddle/optimizer/lr_policy.h @@ -17,36 +17,56 @@ public: // constant learning rate policy class ConstLr final : public LrPolicy { public: - ConstLr(double lr) : learning_rate(lr){}; + ConstLr(double lr) : learning_rate_(lr){}; double LearningRate(const uint64_t num_sample_passed) { - return learning_rate; + return learning_rate_; + } + const char *SerializeState(int *state_len) { + LrPolicyState state; + state.set_learning_rate(learning_rate_); + auto str = state.SerializeAsString(); + *state_len = str.size(); + return str.c_str(); + } + void DeserializeState(const std::string &str) { + LrPolicyState state; + state.ParseFromString(str); + learning_rate_ = state.learning_rate(); } - const char *SerializeState(int *state_len) { return nullptr; } - void DeserializeState(const std::string &state) {} private: - double learning_rate; + double learning_rate_; }; class LinearLr final : public LrPolicy { public: LinearLr(double lr, double lr_decay_a, double lr_decay_b) - : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {} + : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {} double LearningRate(const uint64_t num_sample_passed) { - return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b); + return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed, + lr_decay_b_); } const char *SerializeState(int *state_len) { - // TODO(zhihong) : add lr_policy serialization - return nullptr; + LrPolicyState state; + state.set_learning_rate(learning_rate_); + state.set_lr_decay_a(lr_decay_a_); + state.set_lr_decay_b(lr_decay_b_); + auto str = state.SerializeAsString(); + *state_len = str.size(); + return str.c_str(); } - void 
DeserializeState(const std::string &state) { - // TODO(zhihong) : add lr_policy serialization + void DeserializeState(const std::string &str) { + LrPolicyState state; + state.ParseFromString(str); + learning_rate_ = state.learning_rate(); + lr_decay_a_ = state.lr_decay_a(); + lr_decay_b_ = state.lr_decay_b(); } private: - double learning_rate; - double lr_decay_a; - double lr_decay_b; + double learning_rate_; + double lr_decay_a_; + double lr_decay_b_; }; } // namespace optimizer diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc index 34e051003fa83f11b1f4a39c46856e0372836a1a..15418faa840c19e776f293700ee886991754fb04 100644 --- a/paddle/optimizer/sgd_optimizer.cc +++ b/paddle/optimizer/sgd_optimizer.cc @@ -30,16 +30,20 @@ void SGDOptimizer::Update(const Tensor *gradient) { const char *SGDOptimizer::SerializeState(int *state_len) { SGDOptimizerState state; state.set_num_sample_passed(num_sample_passed_); + std::string lr_str = this->lr_policy_->SerializeState(state_len); + state.mutable_lr_state()->ParseFromString(lr_str); TensorToProto(*parameter_, state.mutable_parameter()); if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums()); auto str = state.SerializeAsString(); - *state_len = str.size(); + *state_len += str.size(); return str.c_str(); } void SGDOptimizer::DeserializeState(const std::string &str) { SGDOptimizerState state; state.ParseFromString(str); + auto lr_state = state.lr_state(); + this->lr_policy_->DeserializeState(lr_state.SerializeAsString()); num_sample_passed_ = state.num_sample_passed(); ProtoToTensor(state.parameter(), parameter_); if (momentum_ != 0.0) ProtoToTensor(state.parameter(), momentums_); diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto index c698d3c2ddbf58a41ac6ee960af83a257325d1f9..2a87e293f64d3398dea2641c3ff292eceec7e154 100644 --- a/proto/OptimizerConfig.proto +++ b/proto/OptimizerConfig.proto @@ -78,11 +78,15 @@ enum DataType { repeated bytes content = 2; } 
+message LrPolicyState { + // learning rate policy + optional double learning_rate = 1 [default = 1.0]; + optional double lr_decay_a = 2; + optional double lr_decay_b = 3; +} + message SGDOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -91,9 +95,7 @@ message SGDOptimizerState { message AdadeltaOptimizerState { // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -102,11 +104,9 @@ message AdadeltaOptimizerState { optional TensorProto update_delta = 4; } + message AdagradOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1; @@ -114,10 +114,7 @@ message AdagradOptimizerState { message AdamOptimizerState { - // learning rate policy - optional double learning_rate = 101; - optional double lr_decay_a = 102; - optional double lr_decay_b = 103; + optional LrPolicyState lr_state = 101; optional double num_sample_passed = 104; // state optional TensorProto parameter = 1;