diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 732c5b88233a0933358f0252cf67650650a1980c..0f6c4eb2dd1de3b804825ca11a6192909366a814 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -9,7 +9,7 @@ set(OPITMIZER_SRCS
     sgd_optmizer.cc
     )
 
-set(OPITMIZER_Headers
+set(OPITMIZER_HEADERS
     adadelta_optimizer.h
     adagrad_optimizer.h
     adam_optimizer.h
@@ -17,12 +17,12 @@ set(OPITMIZER_Headers
     optimizer.h
     parameter_optimizer.h
     sgd_optimizer.h
-    Tensor.h
+    tensor.h
     )
 
 add_library(optimizer STATIC ${OPITMIZER_SRCS})
 add_dependencies(optimizer gen_proto_cpp)
 
-add_simple_unittest(Tensor_test)
+add_simple_unittest(tensor_test)
 add_simple_unittest(parameter_optimizer_test)
 add_dependencies(parameter_optimizer_test optimizer)
diff --git a/paddle/optimizer/Tensor.h b/paddle/optimizer/Tensor.h
index 41afacd75659f71d98eedab0cd69d0f66b50a27c..a887005cf41d8af24d902acf6b2236daff1cc12c 100644
--- a/paddle/optimizer/Tensor.h
+++ b/paddle/optimizer/Tensor.h
@@ -32,7 +32,7 @@ public:
     return data_[idx];
   }
   // TODO: replace with tensorshape
-  size_t size() const { return this->width_; }
+  size_t size() const { return this->width_ * this->height_; }
 
 protected:
   size_t height_;
diff --git a/paddle/optimizer/Tensor_test.cpp b/paddle/optimizer/Tensor_test.cpp
index 4b7059f9943a9432fd36566141893216df5b1fca..3a21b6d3032f4e64dbf80539eb5572602cb127ac 100644
--- a/paddle/optimizer/Tensor_test.cpp
+++ b/paddle/optimizer/Tensor_test.cpp
@@ -1,6 +1,6 @@
-#include "Tensor.h"
 #include 
 #include "gtest/gtest.h"
+#include "tensor.h"
 
 using namespace paddle;
 using namespace paddle::optimizer;
@@ -13,6 +13,7 @@ TEST(Tensor, indexer) {
   }
   ASSERT_EQ(t[2], 2);
   ASSERT_EQ(t[1], 1);
+  delete ptr;
 }
 
 int main(int argc, char** argv) {
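The Tensor change above is subtle on its own: `size()` previously reported only `width_`, so any view with a non-trivial `height_` under-counted its elements, and the test now frees the backing buffer itself because the tensor does not own it. The sketch below (a hypothetical `TensorView`, assuming `real` mirrors the project's float typedef) illustrates the semantics the fix restores; it is not the actual `tensor.h`.

```cpp
// Minimal sketch of the buffer view that tensor.h models (hypothetical
// stand-in, not the real header): a flat, non-owning pointer plus a
// height x width shape. With a 2-D shape, returning width_ alone from
// size() under-counts the elements, hence the width_ * height_ fix.
#include <cassert>
#include <cstddef>

using real = float;  // assumption: mirrors the project's `real` typedef

class TensorView {
public:
  TensorView(real* data, size_t size) : height_(1), width_(size), data_(data) {}
  TensorView(real* data, size_t height, size_t width)
      : height_(height), width_(width), data_(data) {}
  real& operator[](size_t idx) {
    assert(idx < size());
    return data_[idx];
  }
  size_t size() const { return width_ * height_; }

protected:
  size_t height_;
  size_t width_;
  real* data_;  // not owned; whoever allocated the buffer frees it
};

int main() {
  real* buf = new real[6];
  TensorView t(buf, 2, 3);
  for (size_t i = 0; i < t.size(); ++i) t[i] = static_cast<real>(i);
  assert(t[5] == 5);  // with the old size() == width_ this index would trip the assert
  delete[] buf;       // caller-managed memory, like the `delete ptr` added to the test
  return 0;
}
```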
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index f10ee1bcd4beffaa7fad0aaba78870a51d3f4944..7381f9d40e7fdec7c595ac8bb0db26ed96f38c16 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -6,32 +6,33 @@ namespace paddle {
 namespace optimizer {
 
 void AdadeltaOptimizer::set_weight(Tensor* p) {
+  parameter_ = p;
   size_t size = p->size();
   real* gptr = new real[size];
-  accum_gradient = new Tensor(gptr, size);
+  accum_gradient_ = new Tensor(gptr, size);
   real* dptr = new real[size];
-  accum_delta = new Tensor(dptr, size);
+  accum_delta_ = new Tensor(dptr, size);
   real* dptr_current = new real[size];
-  update_delta = new Tensor(dptr_current, size);
+  update_delta_ = new Tensor(dptr_current, size);
 }
 
-void AdadeltaOptimizer::update(const Tensor* gradient) {
-  num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+void AdadeltaOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
   Tensor& param = *parameter_;
   const Tensor& grad = *gradient;
-  Tensor& accum_g = *accum_gradient;
-  Tensor& accum_d = *accum_delta;
-  Tensor& update_d = *update_delta;
+  Tensor& accum_g = *accum_gradient_;
+  Tensor& accum_d = *accum_delta_;
+  Tensor& update_d = *update_delta_;
   for (size_t i = 0; i < param.size(); ++i) {
-    accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
+    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];
 
-    update_d[i] = std::sqrt(accum_d[i] + epsilon) /
-                  std::sqrt(accum_g[i] + epsilon) * grad[i];
+    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
+                  std::sqrt(accum_g[i] + epsilon_) * grad[i];
 
-    accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
+    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];
 
-    param[i] -= learning_rate * update_d[i] + learning_rate * decay * param[i];
+    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
   }
 }
 }  // namespace optimizer
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 1d8bd5a654c66a3d2ad247a8ba58be6c5355baa0..20801c3794b6c5a67cf64c6d2684cb33bf54273d 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -8,29 +8,25 @@ namespace optimizer {
 
 class AdadeltaOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer::parameter_;
-  using ParameterOptimizer::num_sample_passed;
-  using ParameterOptimizer::lr_policy;
-
-  AdadeltaOptimizer(double rho, double epsilon, double decay, BaseLr *lr)
-      : ParameterOptimizer(lr), rho(rho), epsilon(epsilon), decay(decay) {}
+  AdadeltaOptimizer(double rho, double epsilon, double decay, LrPolicy *lr)
+      : ParameterOptimizer(lr), rho_(rho), epsilon_(epsilon), decay_(decay) {}
   ~AdadeltaOptimizer() {
-    if (accum_gradient) delete accum_gradient;
-    if (accum_delta) delete accum_delta;
-    if (update_delta) delete update_delta;
+    if (accum_gradient_) delete accum_gradient_;
+    if (accum_delta_) delete accum_delta_;
+    if (update_delta_) delete update_delta_;
   }
-  void update(const Tensor *gradient);
+  void Update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
 private:
-  Tensor *accum_gradient;
-  Tensor *accum_delta;
-  Tensor *update_delta;
+  Tensor *accum_gradient_;
+  Tensor *accum_delta_;
+  Tensor *update_delta_;
 
-  double rho;
-  double epsilon;
-  double decay;
+  double rho_;
+  double epsilon_;
+  double decay_;
 };
 }  // namespace optimizer
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 437bd4682d5aac726143cc7a284b3965b337be53..e3a9960e150b646e71e57b6d0ff4a207638e740b 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -6,21 +6,22 @@ namespace paddle {
 namespace optimizer {
 
 void AdagradOptimizer::set_weight(Tensor* p) {
+  parameter_ = p;
   size_t size = p->size();
   real* gptr = new real[size];
-  accum_gradient = new Tensor(gptr, size);
+  accum_gradient_ = new Tensor(gptr, size);
 }
 
-void AdagradOptimizer::update(const Tensor* gradient) {
-  num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+void AdagradOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
   Tensor& param = *parameter_;
+  Tensor& accum_g = *accum_gradient_;
   const Tensor& grad = *gradient;
-  Tensor& accum_g = *accum_gradient;
   for (size_t i = 0; i < param.size(); ++i) {
     accum_g[i] += grad[i] * grad[i];
-    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon) +
-                learning_rate * decay * param[i];
+    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
+                learning_rate * decay_ * param[i];
   }
 }
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index aa5f74ffcdf9beaf8e1525bd383cdbf9c6ba92fe..bb64d7d5a778431ec584773a72f539ce32eeae29 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -8,19 +8,19 @@ namespace optimizer {
 
 class AdagradOptimizer : public ParameterOptimizer {
 public:
-  AdagradOptimizer(double epsilon, double decay, BaseLr *lr)
-      : ParameterOptimizer(lr), epsilon(epsilon), decay(decay) {}
+  AdagradOptimizer(double epsilon, double decay, LrPolicy *lr)
+      : ParameterOptimizer(lr), epsilon_(epsilon), decay_(decay) {}
   ~AdagradOptimizer() {
-    if (accum_gradient) delete accum_gradient;
+    if (accum_gradient_) delete accum_gradient_;
   }
-  void update(const Tensor *gradient);
+  void Update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
 private:
-  Tensor *accum_gradient;
-  double epsilon;
-  double decay;
+  Tensor *accum_gradient_;
+  double epsilon_;
+  double decay_;
 };
 }  // namespace optimizer
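Aside from the `_`-suffix member renames and the `Update` capitalization, the Adadelta and Adagrad logic above is unchanged. For readability, here is the Adadelta per-element rule restated as a standalone free function over `std::vector` (illustrative only, not part of the patch):

```cpp
// Restatement of the per-element Adadelta step from adadelta_optimizer.cc,
// pulled out as a free function so the rule is easy to read in isolation.
// Parameter names mirror the renamed members (rho_, epsilon_, decay_).
#include <cmath>
#include <cstddef>
#include <vector>

void AdadeltaStep(std::vector<double>& param,
                  const std::vector<double>& grad,
                  std::vector<double>& accum_g,   // running E[g^2]
                  std::vector<double>& accum_d,   // running E[dx^2]
                  std::vector<double>& update_d,  // current dx
                  double rho, double epsilon, double decay,
                  double learning_rate) {
  for (size_t i = 0; i < param.size(); ++i) {
    // running average of squared gradients
    accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
    // RMS-scaled update direction
    update_d[i] = std::sqrt(accum_d[i] + epsilon) /
                  std::sqrt(accum_g[i] + epsilon) * grad[i];
    // running average of squared updates
    accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
    // apply the update plus L2-style weight decay
    param[i] -= learning_rate * update_d[i] + learning_rate * decay * param[i];
  }
}
```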
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index 6b3f275bf0641624d5d6ed0bded04ff38d880cdc..ae96b30b948ccf3442e4c57fdb9334024d58c218 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -5,6 +5,7 @@ namespace paddle {
 namespace optimizer {
 
 void AdamOptimizer::set_weight(Tensor *p) {
+  parameter_ = p;
   size_t size = p->size();
   real *mptr = new real[size];
   momentums_ = new Tensor(mptr, size);
@@ -12,21 +13,21 @@ void AdamOptimizer::set_weight(Tensor *p) {
   velocitys_ = new Tensor(vptr, size);
 }
 
-void AdamOptimizer::update(const Tensor *gradient) {
-  num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
-  double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
-  double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
+void AdamOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
+  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
   learning_rate *= std::sqrt(coef2) / coef1;
   Tensor &param = *parameter_;
   const Tensor &grad = *gradient;
   Tensor &m = *momentums_;
   Tensor &v = *velocitys_;
   for (size_t i = 0; i < param.size(); ++i) {
-    m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
-    v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
+    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
+    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
     param[i] -=
-        learning_rate * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
+        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
   }
 }
 }  // namespace optimizer
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 55a44b032df87cb56d7b89a2c924719ed5b62bc1..89e68346d58a5063eb50f7cc7e0f9544b55ccf6c 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -9,27 +9,27 @@ namespace optimizer {
 class AdamOptimizer : public ParameterOptimizer {
 public:
   AdamOptimizer(
-      double beta_1, double beta_2, double epsilon, double decay, BaseLr *lr)
+      double beta_1, double beta_2, double epsilon, double decay, LrPolicy *lr)
       : ParameterOptimizer(lr),
-        beta_1(beta_1),
-        beta_2(beta_2),
-        epsilon(epsilon),
-        decay(decay) {}
+        beta_1_(beta_1),
+        beta_2_(beta_2),
+        epsilon_(epsilon),
+        decay_(decay) {}
   ~AdamOptimizer() {
     if (momentums_) delete momentums_;
     if (velocitys_) delete velocitys_;
   }
-  void update(const Tensor *gradient);
+  void Update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
 private:
   Tensor *momentums_;
   Tensor *velocitys_;
-  double beta_1;
-  double beta_2;
-  double epsilon;
-  double decay;
+  double beta_1_;
+  double beta_2_;
+  double epsilon_;
+  double decay_;
 };
 }  // namespace optimizer
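The `coef1`/`coef2` lines in `adam_optimizer.cc` fold Adam's bias correction into the step size instead of correcting `m` and `v` element-wise, which is equivalent to the usual `m_hat / (sqrt(v_hat) + eps)` form up to where epsilon enters. Here is a hedged restatement as a free function (illustrative only, not part of the patch):

```cpp
// Sketch of the bias-corrected Adam step used in adam_optimizer.cc.
// The correction factors (1 - beta1^t) and (1 - beta2^t) are folded into
// the learning rate, matching the coef1/coef2 code above.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

void AdamStep(std::vector<double>& param,
              const std::vector<double>& grad,
              std::vector<double>& m,  // first-moment estimate
              std::vector<double>& v,  // second-moment estimate
              uint64_t t,              // samples/steps seen so far, starting at 1
              double beta1, double beta2, double epsilon, double decay,
              double base_lr) {
  double coef1 = 1.0 - std::pow(beta1, static_cast<double>(t));  // first-moment bias correction
  double coef2 = 1.0 - std::pow(beta2, static_cast<double>(t));  // second-moment bias correction
  double lr = base_lr * std::sqrt(coef2) / coef1;
  for (size_t i = 0; i < param.size(); ++i) {
    m[i] = beta1 * m[i] + (1.0 - beta1) * grad[i];
    v[i] = beta2 * v[i] + (1.0 - beta2) * grad[i] * grad[i];
    param[i] -= lr * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
  }
}
```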
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index e1017cf32dcfc8b5bcd4591e971cff2a5ccd5b58..b24a17ced01b0851ffd66147b6265cbd63ce2337 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -7,34 +7,34 @@ namespace paddle {
 namespace optimizer {
 
-class BaseLr {
+class LrPolicy {
 public:
-  BaseLr(double lr) : learning_rate(lr) {}
-  virtual ~BaseLr() {}
-  virtual double get_learning_rate(const uint64_t num_sample_passed) = 0;
-
-protected:
-  double learning_rate;
+  virtual ~LrPolicy() {}
+  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
 };
 
 // constant learning rate policy
-class ConstLr final : public BaseLr {
+class ConstLr final : public LrPolicy {
 public:
-  ConstLr(double lr) : BaseLr(lr){};
-  double get_learning_rate(const uint64_t num_sample_passed) {
+  ConstLr(double lr) : learning_rate(lr){};
+  double LearningRate(const uint64_t num_sample_passed) {
     return learning_rate;
   }
+
+protected:
+  double learning_rate;
 };
 
-class LinearLr final : public BaseLr {
+class LinearLr final : public LrPolicy {
 public:
   LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : BaseLr(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
-  double get_learning_rate(const uint64_t num_sample_passed) {
+      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+  double LearningRate(const uint64_t num_sample_passed) {
     return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
   }
 
 private:
+  double learning_rate;
   double lr_decay_a;
   double lr_decay_b;
 };
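With this rename, `LrPolicy` becomes a pure interface and each concrete policy owns its own `learning_rate`, so optimizers only ever call `LearningRate(num_sample_passed)` through the base pointer. A small usage sketch, assuming it is compiled inside `paddle/optimizer` so `lr_policy.h` resolves:

```cpp
// Quick demonstration of the renamed interface: policies are consumed
// through a LrPolicy*, exactly as ParameterOptimizer does via
// lr_policy_->LearningRate(num_sample_passed_).
#include <cstdint>
#include <iostream>
#include "lr_policy.h"

using namespace paddle::optimizer;

int main() {
  LrPolicy* constant = new ConstLr(0.1);
  LrPolicy* linear = new LinearLr(0.1, /*lr_decay_a=*/1e-5, /*lr_decay_b=*/1e-3);
  const uint64_t samples[] = {0, 1000, 100000};
  for (uint64_t n : samples) {
    std::cout << "const: " << constant->LearningRate(n)
              << "  linear: " << linear->LearningRate(n) << "\n";
  }
  delete constant;
  delete linear;
  return 0;
}
```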
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index 10b3339c2d5a6f57a45f3e93d7badd52bc08c42b..e9bcdcd80163ee1f05fab491cff858137bed3f76 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -37,7 +37,7 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
                                           int config_proto_len) {
   paddle_optimizer* optimizer = new paddle_optimizer;
   std::string config(config_proto, config_proto + config_proto_len);
-  optimizer->impl = ParameterOptimizer::create(config);
+  optimizer->impl = ParameterOptimizer::Create(config);
   return optimizer;
 }
 
@@ -53,7 +53,7 @@ int paddle_update_parameter(paddle_optimizer* o,
   // TOOD(zhihong): datatype not work. need to add the runtime datatype
   auto grad_type = reinterpret_cast(grad_buffer);
   Tensor* gradient = new Tensor(const_cast(grad_type), num_bytes);
-  o->impl->update(gradient);
+  o->impl->Update(gradient);
   return PADDLE_SUCCESS;
 }
diff --git a/paddle/optimizer/optimizer_test.cpp b/paddle/optimizer/optimizer_test.cpp
deleted file mode 100644
index 1bdc6f40fcaa273da93d084697d954f91b072152..0000000000000000000000000000000000000000
--- a/paddle/optimizer/optimizer_test.cpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "optimizer.h"
-#include "gtest/gtest.h"
-
-template
-class Opitmizer_C_Test : public testing::Test {
-private:
-  Tensor parameter;
-  Tensor gradient;
-};
-
-void applyGradientDescent_TEST() {}
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index cbdccd973cec1d4b12d2c77fe0a9d25945c74607..00e9b858551bbab95a234a9da46fb383b5d50778 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -10,41 +10,40 @@ namespace paddle {
 namespace optimizer {
 
-ParameterOptimizer *ParameterOptimizer::create(
-    const ::std::string &config_proto) {
+ParameterOptimizer *ParameterOptimizer::Create(
+    const std::string &config_proto) {
   paddle::OptimizerConfig config;
   CHECK(config.ParseFromString(config_proto) == 0)
-      << "error : optimizer config";
+      << "failed parse optimizer config";
 
-  auto select_lr_policy = [=](const OptimizerConfig &config) -> BaseLr * {
-    std::string s(config.lr_policy());
-    if (s == "ConstLr") return new ConstLr(config.const_lr().learning_rate());
-    if (s == "LinearLr")
+  auto select_lr_policy = [=](const OptimizerConfig &config) -> LrPolicy * {
+    if (config.lr_policy() == OptimizerConfig::ConstLr)
+      return new ConstLr(config.const_lr().learning_rate());
+    if (config.lr_policy() == OptimizerConfig::LinearLr)
       return new LinearLr(config.linear_lr().learning_rate(),
                           config.linear_lr().lr_decay_a(),
                           config.linear_lr().lr_decay_b());
     // default
    return nullptr;
  };
 
-  BaseLr *lr = select_lr_policy(config);
+  LrPolicy *lr = select_lr_policy(config);
 
   auto select_optimizer =
       [=](const OptimizerConfig &config) -> ParameterOptimizer * {
-    std::string s(config.optimizer_name());
-    if (s == "SGD") {
+    if (config.optimizer() == OptimizerConfig::SGD) {
      return new SGDOptimizer(config.sgd().momentum(),
                              config.sgd().decay(),
                              config.sgd().nesterov(),
                              lr);
    }
-    if (s == "Adadelta") {
+    if (config.optimizer() == OptimizerConfig::Adadelta) {
      return new AdagradOptimizer(
          config.adagrad().epsilon(), config.adagrad().decay(), lr);
    }
-    if (s == "Adagrad") {
+    if (config.optimizer() == OptimizerConfig::Adagrad) {
      return new AdagradOptimizer(
          config.adagrad().epsilon(), config.adagrad().decay(), lr);
    }
-    if (s == "Adam") {
+    if (config.optimizer() == OptimizerConfig::Adam) {
      return new AdadeltaOptimizer(config.adadelta().rho(),
                                   config.adadelta().epsilon(),
                                   config.adadelta().decay(),
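Putting the factory change together with the renamed virtuals, a caller now builds an `OptimizerConfig`, serializes it, and hands the bytes to `ParameterOptimizer::Create`. The sketch below mirrors the calls in `parameter_optimizer_test.cpp`; the `real` typedef and the include paths are assumed to be those used elsewhere in `paddle/optimizer`.

```cpp
// Sketch of the refactored lifecycle: build an OptimizerConfig, serialize it,
// Create() the optimizer, register the parameter Tensor, then call Update()
// once per gradient. Mirrors parameter_optimizer_test.cpp.
#include <cstddef>
#include <string>
#include "OptimizerConfig.pb.h"
#include "parameter_optimizer.h"
#include "tensor.h"

using namespace paddle;
using namespace paddle::optimizer;

ParameterOptimizer* CreateSgdOptimizer(real* param_buf, size_t n) {
  OptimizerConfig config;
  config.set_optimizer(OptimizerConfig::SGD);
  config.mutable_sgd()->set_momentum(0.0);
  config.mutable_sgd()->set_decay(0.0);
  config.mutable_sgd()->set_nesterov(false);
  config.set_lr_policy(OptimizerConfig::ConstLr);
  config.mutable_const_lr()->set_learning_rate(0.1);

  // Create() parses the serialized proto and dispatches on the enum fields.
  ParameterOptimizer* opt = ParameterOptimizer::Create(config.SerializeAsString());
  // set_weight() stores the pointer; ~ParameterOptimizer later deletes the
  // Tensor object, while the underlying buffer stays caller-managed.
  opt->set_weight(new Tensor(param_buf, n));
  return opt;
}

void Step(ParameterOptimizer* opt, real* grad_buf, size_t n) {
  Tensor gradient(grad_buf, n);
  opt->Update(&gradient);
}
```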
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index 42e460b676229a66ecace12c86d028d27d38dbef..69e964069b4eb08bbf57bbadaf5158bd04609354 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -5,8 +5,8 @@
 #include 
 #include 
 #include "OptimizerConfig.pb.h"
-#include "Tensor.h"
 #include "lr_policy.h"
+#include "tensor.h"
 
 namespace paddle {
 namespace optimizer {
@@ -17,21 +17,21 @@ public:
   /**
    * @brief update hook for algorithm need to traverse parameter more than
    * once.
   */
-  ParameterOptimizer(BaseLr *lr) : lr_policy(lr), num_sample_passed(0) {}
+  ParameterOptimizer(LrPolicy *lr) : lr_policy_(lr), num_sample_passed_(0) {}
   virtual ~ParameterOptimizer() { delete parameter_; };
-  static ParameterOptimizer *create(const ::std::string &config_proto);
-  virtual void update(const Tensor *gradient) = 0;
+  static ParameterOptimizer *Create(const std::string &config_proto);
+  virtual void Update(const Tensor *gradient) = 0;
   virtual real *get_weight() const;
   virtual void set_weight(Tensor *parameter);
 
-public:
+protected:
   OptimizerConfig config_;
   Tensor *parameter_;
 
   // learning rate policy
-  BaseLr *lr_policy;
-  uint64_t num_sample_passed;
+  LrPolicy *lr_policy_;
+  uint64_t num_sample_passed_;
 };
 }  // namespace optimizer
diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp
index 742e7ec965584c4f7496be8c8b255686f901f4f1..cc791483431298f29bfc806dfeb3fde768d96952 100644
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -42,28 +42,28 @@ public:
   virtual void TearDown() {}
 
   void create_sgd() {
-    config.set_optimizer_name("SGD");
+    config.set_optimizer(OptimizerConfig::SGD);
     config.mutable_sgd()->set_momentum(0.0);
     config.mutable_sgd()->set_decay(0.0);
     config.mutable_sgd()->set_nesterov(false);
-    config.set_lr_policy("ConstLr");
+    config.set_lr_policy(OptimizerConfig::ConstLr);
     config.mutable_const_lr()->set_learning_rate(0.1);
     ParameterOptimizer* opt =
-        ParameterOptimizer::create(config.SerializeAsString());
+        ParameterOptimizer::Create(config.SerializeAsString());
     opts.push_back(opt);
   }
 
   void create_adam() {
-    config.set_optimizer_name("Adam");
+    config.set_optimizer(OptimizerConfig::Adam);
     config.mutable_adam()->set_beta_1(0.9);
     config.mutable_adam()->set_beta_2(0.1);
     config.mutable_adam()->set_epsilon(1e-3);
     config.mutable_adam()->set_decay(0.0);
-    config.set_lr_policy("ConstLr");
+    config.set_lr_policy(OptimizerConfig::ConstLr);
     config.mutable_const_lr()->set_learning_rate(0.1);
     ParameterOptimizer* opt =
-        ParameterOptimizer::create(config.SerializeAsString());
+        ParameterOptimizer::Create(config.SerializeAsString());
     opts.push_back(opt);
   }
   void test_set_weight() {
@@ -88,7 +88,7 @@ public:
   void test_update() {
     Tensor* g = fix_n_Tensor(size);
     for (size_t i = 0; i < opts.size(); ++i) {
-      opts[i]->update(g);
+      opts[i]->Update(g);
     }
   }
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 4eb483c0fbd1e34b9393a9beacd67ab73c4ec3e7..1f6728d61e31be9fea05b9b4d0df514efed2df21 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -8,23 +8,19 @@ namespace optimizer {
 
 class SGDOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer::parameter_;
-  using ParameterOptimizer::num_sample_passed;
-  using ParameterOptimizer::lr_policy;
-
-  SGDOptimizer(double m, double d, bool n, BaseLr* lr)
-      : ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
+  SGDOptimizer(double m, double d, bool n, LrPolicy* lr)
+      : ParameterOptimizer(lr), momentum_(m), decay_(d), nesterov_(n) {}
   virtual ~SGDOptimizer() { delete momentums_; }
-  void update(const Tensor* gradient);
+  void Update(const Tensor* gradient);
   void set_weight(Tensor* p);
   real* get_weight() const;
 
 private:
   Tensor* momentums_;
-  double momentum;
-  double decay;
-  bool nesterov;
+  double momentum_;
+  double decay_;
+  bool nesterov_;
 };
 }  // namespace optimizer
diff --git a/paddle/optimizer/sgd_optmizer.cc b/paddle/optimizer/sgd_optmizer.cc
index 5fdfc89c1f8cf9854c3d6c4f5834f202887ff8c1..c58ab5bbe2b5d64f3281dd8f0fea04802ec90082 100644
--- a/paddle/optimizer/sgd_optmizer.cc
+++ b/paddle/optimizer/sgd_optmizer.cc
@@ -5,31 +5,32 @@ namespace optimizer {
 
 void SGDOptimizer::set_weight(Tensor *p) {
   //  ParameterOptimizer::set_weight(p);
+  parameter_ = p;
   size_t size = p->size();
   // TODO: fix it with align aware allocator bind to Tensor
-  if (momentum != 0.0) {
+  if (momentum_ != 0.0) {
     real *ptr = new real[size];
     momentums_ = new Tensor(ptr, size);
   }
 }
 
-void SGDOptimizer::update(const Tensor *gradient) {
-  num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+void SGDOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
   real velocity = 0.0;
   Tensor &param = *parameter_;
   const Tensor &grad = *gradient;
   Tensor &m = *momentums_;
   for (size_t i = 0; i < param.size(); ++i) {
-    if (momentum == 0.0) {
-      velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
+    if (momentum_ == 0.0) {
+      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
     } else {
-      m[i] = momentum * m[i] - learning_rate * grad[i] -
-             learning_rate * decay * param[i];
+      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
+             learning_rate * decay_ * param[i];
       velocity = m[i];
     }
-    if (nesterov) {
-      param[i] += momentum * velocity - learning_rate * grad[i];
+    if (nesterov_) {
+      param[i] += momentum_ * velocity - learning_rate * grad[i];
     } else {
       param[i] += velocity;
     }
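For reference, the SGD step implemented in `sgd_optmizer.cc` above reduces to plain SGD with weight decay when `momentum_` is zero, keeps a velocity buffer otherwise, and applies the Nesterov look-ahead form when `nesterov_` is set. Restated as a standalone sketch (illustrative only, not part of the patch):

```cpp
// Restatement of the SGD step from sgd_optmizer.cc as a free function.
#include <cstddef>
#include <vector>

void SgdStep(std::vector<double>& param,
             const std::vector<double>& grad,
             std::vector<double>& momentums,  // unused when momentum == 0
             double momentum, double decay, bool nesterov,
             double learning_rate) {
  for (size_t i = 0; i < param.size(); ++i) {
    double velocity = 0.0;
    if (momentum == 0.0) {
      velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
    } else {
      momentums[i] = momentum * momentums[i] - learning_rate * grad[i] -
                     learning_rate * decay * param[i];
      velocity = momentums[i];
    }
    if (nesterov) {
      // look ahead along the velocity before taking the gradient step
      param[i] += momentum * velocity - learning_rate * grad[i];
    } else {
      param[i] += velocity;
    }
  }
}
```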
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
index 0d7b3407b8e8d3a9e63290e8f77a2247b0c1d376..5dd26373379223c243b130a3e14c49a97b4e2182 100644
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
@@ -54,38 +54,40 @@ message AdamConfig {
 
 message ConstLr {
   // learninRate Policy
-  required double learning_rate = 40 [default = 1.0];
+  required double learning_rate = 1 [default = 1.0];
 }
 
 message LinearLr {
   // learninRate Policy
-  required double learning_rate = 40 [default = 1.0];
-  optional double lr_decay_a = 25;
-  optional double lr_decay_b = 26;
+  required double learning_rate = 1 [default = 1.0];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
 }
 
 message OptimizerConfig {
   // common config of optimizer
   // algorithm config, type : string
-  // SGD = 1;
-  // Adadelta = 2;
-  // Adagrad = 3;
-  // Adam = 4;
-  required string optimizer_name = 1;
+  enum Optimizer {
+    SGD = 1;
+    Adadelta = 2;
+    Adagrad = 3;
+    Adam = 4;
+  }
+  required Optimizer optimizer = 1;
   optional SGDConfig sgd = 3;
   optional AdadeltaConfig adadelta = 4;
   optional AdagradConfig adagrad = 5;
   optional AdamConfig adam = 6;
 
   // learning rate runtime policy config
-  // lr_policy , type : string
-  // ConstLr = 0;
-  // LinearLr = 1;
-  required string lr_policy = 11;
+  enum LrPolicy {
+    ConstLr = 0;
+    LinearLr = 1;
+  }
+  required LrPolicy lr_policy = 11;
   optional ConstLr const_lr = 12;
-  optional LinearLr linear_lr = 15;
-  optional uint64 num_sample_passed = 13 [default = 0];
+  optional LinearLr linear_lr = 13;
 
   // common config of optimizer
   optional double clipnorm = 101;
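Since `lr_policy` is now an enum, selecting the linear-decay schedule from C++ looks like the sketch below. Only `set_optimizer`, `set_lr_policy`, `mutable_sgd`, and `mutable_const_lr` appear verbatim in this patch; `mutable_linear_lr`, `set_lr_decay_a`, and `set_lr_decay_b` are the names protoc would normally generate for the fields declared above, so treat them as assumptions.

```cpp
// Sketch: configuring the LinearLr schedule through the regenerated proto.
// Accessors not exercised in parameter_optimizer_test.cpp are assumed from
// protoc's standard naming for the .proto fields above.
#include <string>
#include "OptimizerConfig.pb.h"

std::string MakeLinearLrConfig() {
  paddle::OptimizerConfig config;
  config.set_optimizer(paddle::OptimizerConfig::SGD);
  config.mutable_sgd()->set_momentum(0.9);
  config.mutable_sgd()->set_decay(1e-4);
  config.mutable_sgd()->set_nesterov(true);
  config.set_lr_policy(paddle::OptimizerConfig::LinearLr);
  config.mutable_linear_lr()->set_learning_rate(0.1);  // starting rate
  config.mutable_linear_lr()->set_lr_decay_a(1e-6);    // per-sample decrement
  config.mutable_linear_lr()->set_lr_decay_b(1e-3);    // lower bound on the rate
  return config.SerializeAsString();
}
```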