diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 64672da0c04d14eefa42dd828ccd8d6026a1230f..8d5865cd20fd1e0a8f413e080ddd27197f68bd64 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -5,14 +5,6 @@
 namespace paddle {
 namespace optimizer {
 
-void AdadeltaOptimizer::set_weight(Tensor* p) {
-  parameter_ = p;
-  size_t size = p->size();
-  accum_gradient_ = new Tensor(size);
-  accum_delta_ = new Tensor(size);
-  update_delta_ = new Tensor(size);
-}
-
 void AdadeltaOptimizer::Update(const Tensor* gradient) {
   num_sample_passed_ += 1;
   double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
@@ -32,5 +24,39 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {
     param[i] -=
         learning_rate * update_d[i] + learning_rate * decay_ * param[i];
   }
 }
+
+const char* AdadeltaOptimizer::SerializeState(int* state_len) {
+  OptimizerState state;
+  state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
+  state.set_num_sample_passed(num_sample_passed_);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  TensorToProto(*accum_delta_, state.mutable_accum_delta());
+  TensorToProto(*update_delta_, state.mutable_update_delta());
+  state.set_rho(rho_);
+  state.set_epsilon(epsilon_);
+  state.set_decay(decay_);
+  *state_len += CalStateSize(parameter_,
+                             accum_gradient_,
+                             accum_delta_,
+                             update_delta_,
+                             rho_,
+                             epsilon_,
+                             decay_);
+  return state.SerializeAsString().c_str();
+}
+
+void AdadeltaOptimizer::DeSerializeState(const std::string& str) {
+  OptimizerState state;
+  state.ParseFromString(str);
+  lr_policy_->set(state.learning_rate());
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+  ProtoToTensor(state.accum_delta(), accum_delta_);
+  ProtoToTensor(state.update_delta(), update_delta_);
+}
+
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 4f6137cf2e5aca82a6f0b5afee8afec699bfe313..31f29f867521e30f771b7e9a3b34834974746297 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -7,21 +7,31 @@ namespace optimizer {
 
 class AdadeltaOptimizer : public ParameterOptimizer {
 public:
-  AdadeltaOptimizer(double rho, double epsilon, double decay, LrPolicy *lr)
-      : ParameterOptimizer(lr),
+  AdadeltaOptimizer(
+      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
+      : ParameterOptimizer(parameter, lr),
         accum_gradient_(nullptr),
         accum_delta_(nullptr),
         update_delta_(nullptr),
         rho_(rho),
         epsilon_(epsilon),
-        decay_(decay) {}
+        decay_(decay) {
+    size_t size = parameter->size();
+    accum_gradient_ = new Tensor(size);
+    accum_delta_ = new Tensor(size);
+    update_delta_ = new Tensor(size);
+  }
   ~AdadeltaOptimizer() {
     if (accum_gradient_) delete accum_gradient_;
     if (accum_delta_) delete accum_delta_;
     if (update_delta_) delete update_delta_;
   }
   void Update(const Tensor *gradient);
-  void set_weight(Tensor *p);
+  const char *SerializeState(int *state_len);
+  void DeSerializeState(const std::string &state);
 
 private:
   Tensor *accum_gradient_;
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 1698c2abdbb85192a46ff77f014b5e94635960f7..6a17cf0ed06a1c50dc81414312f6b071b2f42f18 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -5,12 +5,6 @@
 namespace paddle {
 namespace optimizer {
 
-void AdagradOptimizer::set_weight(Tensor* p) {
-  parameter_ = p;
-  size_t size = p->size();
-  accum_gradient_ = new Tensor(size);
-}
-
 void AdagradOptimizer::Update(const Tensor* gradient) {
   num_sample_passed_ += 1;
   double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
@@ -23,6 +17,8 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
                 learning_rate * decay_ * param[i];
   }
 }
+
+const char* AdagradOptimizer::SerializeState(int* state_len) { NIMPL; }
+void AdagradOptimizer::DeSerializeState(const std::string& str) { NIMPL; }
+
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 1b05e99754bfb29ed95b29450310dc7888f9d1d1..14a32cb683aaa8bf7553a46a24bc2d7cc899736d 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -7,16 +7,19 @@ namespace optimizer {
 
 class AdagradOptimizer : public ParameterOptimizer {
 public:
-  AdagradOptimizer(double epsilon, double decay, LrPolicy *lr)
-      : ParameterOptimizer(lr),
-        accum_gradient_(nullptr),
-        epsilon_(epsilon),
-        decay_(decay) {}
+  AdagradOptimizer(Tensor *parameter,
+                   LrPolicy *lr,
+                   double epsilon,
+                   double decay)
+      : ParameterOptimizer(parameter, lr), epsilon_(epsilon), decay_(decay) {
+    size_t size = parameter->size();
+    accum_gradient_ = new Tensor(size);
+  }
   ~AdagradOptimizer() {
     if (accum_gradient_) delete accum_gradient_;
   }
   void Update(const Tensor *gradient);
-  void set_weight(Tensor *p);
+  const char *SerializeState(int *state_len);
+  void DeSerializeState(const std::string &state);
 
 private:
   Tensor *accum_gradient_;
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index d052ac8f9a199337cdfe14bdb3d443e4c25b98e7..20cf4ef5a11cd660207c21bcba77f68f7c11897c 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -28,5 +28,33 @@ void AdamOptimizer::Update(const Tensor *gradient) {
         learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
   }
 }
+
+const char *AdamOptimizer::SerializeState(int *state_len) {
+  OptimizerState state;
+  state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
+  state.set_num_sample_passed(num_sample_passed_);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*momentums_, state.mutable_momentums());
+  TensorToProto(*velocitys_, state.mutable_velocitys());
+
+  state.set_beta_1(beta_1_);
+  state.set_beta_2(beta_2_);
+  state.set_epsilon(epsilon_);
+  state.set_decay(decay_);
+  *state_len += CalStateSize(
+      parameter_, momentums_, velocitys_, beta_1_, beta_2_, epsilon_, decay_);
+  return state.SerializeAsString().c_str();
+}
+
+void AdamOptimizer::DeSerializeState(const std::string &str) {
+  OptimizerState state;
+  state.ParseFromString(str);
+  lr_policy_->set(state.learning_rate());
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.momentums(), momentums_);
+  ProtoToTensor(state.velocitys(), velocitys_);
+  beta_1_ = state.beta_1();
+  beta_2_ = state.beta_2();
+}
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 840927e6c7d59ee3d51e284fb2593508e812ffbd..cb211c6d88832b2afcc3f3a78ef829661b773f2b 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -7,9 +7,12 @@ namespace optimizer {
 
 class AdamOptimizer : public ParameterOptimizer {
 public:
-  AdamOptimizer(
-      double beta_1, double beta_2, double epsilon, double decay, LrPolicy *lr)
-      : ParameterOptimizer(lr),
+  AdamOptimizer(Tensor *parameter,
+                LrPolicy *lr,
+                double beta_1,
+                double beta_2,
+                double epsilon,
+                double decay)
+      : ParameterOptimizer(parameter, lr),
         momentums_(nullptr),
         velocitys_(nullptr),
         beta_1_(beta_1),
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index ab7d1fb0fe6a980df0db52d04bf89f3cfb453147..686ba22671505b7975ebf8e76e13abbc7e67ba3d 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -10,6 +10,7 @@ class LrPolicy {
 public:
   virtual ~LrPolicy() {}
   virtual double LearningRate(const uint64_t num_sample_passed) = 0;
+  virtual void set(double current_learning_rate) = 0;
 };
 
 // constant learning rate policy
@@ -19,6 +20,9 @@ public:
   double LearningRate(const uint64_t num_sample_passed) {
     return learning_rate;
   }
+  void set(double current_learning_rate) {
+    learning_rate = current_learning_rate;
+  }
 
 private:
   double learning_rate;
@@ -31,6 +35,9 @@ public:
   double LearningRate(const uint64_t num_sample_passed) {
     return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
   }
+  void set(double current_learning_rate) {
+    learning_rate = current_learning_rate;
+  }
 
 private:
   double learning_rate;
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index 9572f163ddeaa72105455a1e034f3ae6d0f39d35..c06c0737b25d7d4979bd483a7b64130893646169 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -44,13 +44,13 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
                                           const int state_len) {
   paddle_optimizer* optimizer = new paddle_optimizer;
   std::string config(config_proto, config_proto + config_proto_len);
-  optimizer->impl = ParameterOptimizer::Create(config);
+  Tensor* parameter =
+      new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
+  optimizer->impl = ParameterOptimizer::Create(config, parameter);
   if (state != nullptr) {
     std::string s(state, state + state_len);
     optimizer->impl->DeSerializeState(s);
   }
-  Tensor* param = new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
-  optimizer->impl->set_weight(param);
   return optimizer;
 }
 
@@ -77,6 +77,7 @@ int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
 }
 
 int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
-  *state = o->impl->SerializeState();
-  return strlen(*state);
+  int state_len = 0;
+  *state = o->impl->SerializeState(&state_len);
+  return state_len;
 }
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index 1a96880c57464f814d8e64f705cb974eb0e4f682..ae3e97bba8d10916908a71d94cf7e905e56c2653 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -10,8 +10,8 @@
 namespace paddle {
 namespace optimizer {
 
-ParameterOptimizer *ParameterOptimizer::Create(
-    const std::string &config_proto) {
+ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
+                                               Tensor *parameter) {
   paddle::OptimizerConfig config;
   CHECK(config.ParseFromString(config_proto) == 0)
       << "failed parse optimizer config";
@@ -29,34 +29,38 @@ ParameterOptimizer *ParameterOptimizer::Create(
   };
   LrPolicy *lr = select_lr_policy(config);
   auto select_optimizer =
-      [=](const OptimizerConfig &config) -> ParameterOptimizer * {
+      [=](Tensor *parameter,
+          const OptimizerConfig &config) -> ParameterOptimizer * {
     if (config.optimizer() == OptimizerConfig::SGD) {
-      return new SGDOptimizer(config.sgd().momentum(),
+      return new SGDOptimizer(parameter,
+                              lr,
+                              config.sgd().momentum(),
                               config.sgd().decay(),
-                              config.sgd().nesterov(),
-                              lr);
+                              config.sgd().nesterov());
     }
     if (config.optimizer() == OptimizerConfig::Adadelta) {
-      return new AdadeltaOptimizer(config.adadelta().rho(),
+      return new AdadeltaOptimizer(parameter,
+                                   lr,
+                                   config.adadelta().rho(),
                                    config.adadelta().epsilon(),
-                                   config.adadelta().decay(),
-                                   lr);
+                                   config.adadelta().decay());
     }
     if (config.optimizer() == OptimizerConfig::Adagrad) {
       return new AdagradOptimizer(
-          config.adagrad().epsilon(), config.adagrad().decay(), lr);
+          parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
     }
     if (config.optimizer() == OptimizerConfig::Adam) {
-      return new AdamOptimizer(config.adam().beta_1(),
+      return new AdamOptimizer(parameter,
+                               lr,
+                               config.adam().beta_1(),
                                config.adam().beta_2(),
                                config.adam().epsilon(),
-                               config.adam().decay(),
-                               lr);
+                               config.adam().decay());
     }
     // default
     LOG(WARNING)
         << "have not select any Optimizer. use SGDOptimizer in default";
-    return new SGDOptimizer(0.0, 0.0, false, lr);
+    return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
   };
-  return select_optimizer(config);
+  return select_optimizer(parameter, config);
 }
 
@@ -66,7 +70,5 @@ float *ParameterOptimizer::get_weight(int *param_size) const {
   return parameter_->get_buffer();
 }
 
-void ParameterOptimizer::set_weight(Tensor *p) { parameter_ = p; }
-
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index f65f1d71a4271b2317d3e5714435cd2e68f3a54c..1abd659d4844bf6a0e99975664d7147da6ac5b14 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -5,32 +5,35 @@
 #include
 #include "OptimizerConfig.pb.h"
 #include "lr_policy.h"
+#include "serialization.h"
 #include "tensor.h"
 
+// Not-Implemented-Yet macro.
+#define NIMPL crash(__PRETTY_FUNCTION__, " not implemented yet")
+
 namespace paddle {
 namespace optimizer {
 
-const std::string kOptimizerVersion = "1.0";
-
 class ParameterOptimizer {
 public:
   /**
    * @brief update hook for algorithm need to traverse parameter more than
    * once.
   */
-  ParameterOptimizer(LrPolicy *lr) : lr_policy_(lr), num_sample_passed_(0) {}
+  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
+      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
   virtual ~ParameterOptimizer() { delete parameter_; };
-  static ParameterOptimizer *Create(const std::string &config_proto);
-  virtual const char *SerializeState();
-  virtual void DeSerializeState(const std::string &state);
+  static ParameterOptimizer *Create(const std::string &config_proto,
+                                    Tensor *parameter);
   virtual void Update(const Tensor *gradient) = 0;
   virtual float *get_weight(int *param_size) const;
-  virtual void set_weight(Tensor *parameter);
+  virtual const char *SerializeState(int *state_len) = 0;
+  virtual void DeSerializeState(const std::string &state) = 0;
 
 protected:
   Tensor *parameter_;
-  // learning rate policy
   LrPolicy *lr_policy_;
   uint64_t num_sample_passed_;
diff --git a/paddle/optimizer/serialization.h b/paddle/optimizer/serialization.h
index 6caa514cef25eb3fb1a3a7efe972063a3d378416..a330dd96e93224ebba275d71eb899aadbc4549d2 100644
--- a/paddle/optimizer/serialization.h
+++ b/paddle/optimizer/serialization.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include <type_traits>
 #include "OptimizerConfig.pb.h"
 #include "paddle/utils/Logging.h"
 #include "tensor.h"
@@ -9,6 +10,17 @@
 namespace paddle {
 namespace optimizer {
 
+// Byte size of the serialized state: fundamental hyperparameters contribute
+// sizeof(T); Tensor pointers contribute element count times sizeof(float).
+inline unsigned CalStateSize() { return 0; }
+
+template <typename HEAD, typename... TAIL>
+typename std::enable_if<std::is_fundamental<HEAD>::value, unsigned>::type
+CalStateSize(const HEAD& head, const TAIL&... tail) {
+  return sizeof(head) + CalStateSize(tail...);
+}
+
+template <typename HEAD, typename... TAIL>
+typename std::enable_if<!std::is_fundamental<HEAD>::value, unsigned>::type
+CalStateSize(const HEAD& head, const TAIL&... tail) {
+  return head->size() * sizeof(float) + CalStateSize(tail...);
+}
+
 static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
   proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
   proto->set_size(tensor.size());
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 10b46db38686a25b5e5742ddd0b1642510d89b29..d0ac375d2b8f13383a4281db81e4d832dbbf6b8e 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -7,20 +7,26 @@ namespace optimizer {
 
 class SGDOptimizer : public ParameterOptimizer {
 public:
-  SGDOptimizer(double m, double d, bool n, LrPolicy* lr)
-      : ParameterOptimizer(lr),
+  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
+      : ParameterOptimizer(parameter, lr),
         momentums_(nullptr),
         momentum_(m),
         decay_(d),
-        nesterov_(n) {}
-  virtual ~SGDOptimizer() { delete momentums_; }
+        nesterov_(n) {
+    if (momentum_ != 0.0) {
+      size_t size = parameter->size();
+      // TODO: fix it with align aware allocator bind to Tensor
+      momentums_ = new Tensor(size);
+    }
+  }
+  virtual ~SGDOptimizer() {
+    if (momentums_) delete momentums_;
+  }
   void Update(const Tensor* gradient);
-  const char* SerializeState();
+  const char* SerializeState(int* state_len);
   void DeSerializeState(const std::string& state);
-  void set_weight(Tensor* p);
-  float* get_weight(int* param_size) const;
-
 private:
   Tensor* momentums_;
   double momentum_;
diff --git a/paddle/optimizer/sgd_optmizer.cc b/paddle/optimizer/sgd_optmizer.cc
index fea550f8d1d7c4c0051a12e880ce815c9e74ef43..b2c6b7a1acf2d6456565b3fd2c06caa9875c67c1 100644
--- a/paddle/optimizer/sgd_optmizer.cc
+++ b/paddle/optimizer/sgd_optmizer.cc
@@ -4,15 +4,6 @@
 namespace paddle {
 namespace optimizer {
 
-void SGDOptimizer::set_weight(Tensor *p) {
-  parameter_ = p;
-  if (momentum_ != 0.0) {
-    size_t size = p->size();
-    // TODO: fix it with align aware allocator bind to Tensor
-    momentums_ = new Tensor(size);
-  }
-}
-
 void SGDOptimizer::Update(const Tensor *gradient) {
   num_sample_passed_ += 1;
   double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
@@ -36,28 +27,30 @@ void SGDOptimizer::Update(const Tensor *gradient) {
   }
 }
 
-const char *SGDOptimizer::SerializeState() {
+const char *SGDOptimizer::SerializeState(int *state_len) {
   OptimizerState state;
-  // version is a global const value
-  state.set_version(kOptimizerVersion);
-  TensorToProto(*parameter_, state.add_data());
-  TensorToProto(*momentums_, state.add_data());
-  state.add_hyperparam(momentum_);
+  state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
+  state.set_num_sample_passed(num_sample_passed_);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*momentums_, state.mutable_momentums());
+  state.set_momentum(momentum_);
+  state.set_decay(decay_);
+  state.set_nesterov(nesterov_);
+  *state_len +=
+      CalStateSize(parameter_, momentums_, momentum_, decay_, nesterov_);
   return state.SerializeAsString().c_str();
 }
 
 void SGDOptimizer::DeSerializeState(const std::string &str) {
   OptimizerState state;
   state.ParseFromString(str);
-  CHECK(state.version() == kOptimizerVersion)
-      << "error version of state"
-      << "expected : " << kOptimizerVersion << "get : " << state.version();
+  lr_policy_->set(state.learning_rate());
+  num_sample_passed_ = state.num_sample_passed();
 
-  ProtoToTensor(state.data(0), parameter_);
-  if (state.data_size() == 2) {
-    ProtoToTensor(state.data(1), momentums_);
-    momentum_ = state.hyperparam(0);
-  }
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.momentums(), momentums_);
+  momentum_ = state.momentum();
 }
 
 }  // namespace optimizer
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
index 3986fce5da696ca4fea2df24d7f9e5ba0efc76d1..1ccba6d207612162a76756b82af196de2e3fbfd6 100644
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
@@ -5,13 +5,14 @@ option optimize_for = LITE_RUNTIME;
 package paddle;
 
 message SGDConfig {
-  // SGD 
+  // SGD
   // momentum: float >= 0. Parameter updates momentum.
   // decay: float >= 0. Learning rate decay over each update.
   // nesterov: boolean. Whether to apply Nesterov momentum.
   optional double momentum = 21 [default = 0.0];
   optional double decay = 23 [default = 0.0];
   optional bool nesterov = 24 [default = false];
+
 }
 
@@ -75,14 +76,38 @@ enum DataType {
   }
   required DataType data_type = 1;
   repeated bytes content = 2;
-  optional uint64 size = 3;
 }
 
 message OptimizerState {
-  // match old training state with format parser
-  required string version = 100;
-  repeated TensorProto data = 1;
-  repeated double hyperparam = 3;
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+
+  optional TensorProto parameter = 105;
+
+  // momentum
+  optional TensorProto momentums = 1;
+
+  // adadelta
+  optional TensorProto accum_gradient = 2;
+  optional TensorProto accum_delta = 3;
+  optional TensorProto update_delta = 4;
+
+  // adam
+  optional TensorProto velocitys = 5;
+
+  // momentum
+  optional double momentum = 6;
+  optional double decay = 7;
+  optional bool nesterov = 8;
+
+  // adadelta
+  optional double rho = 9;
+  optional double epsilon = 10;
+
+  // adam
+  optional double beta_1 = 11;
+  optional double beta_2 = 12;
+
 }
 
 message OptimizerConfig {