diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 95d7ad720f3c6ab01ef3c8cf0987b710fa39c6ab..192d0756202736a1ef2173676a8aedd6892522c7 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -1,23 +1,21 @@
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 set(OPITMIZER_SRCS
-    # adadelta_optimizer.cc
-    # adagrad_optimizer.cc
-    # adam_optimizer.cc
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
     optimizer.cc
     parameter_optimizer.cc
     sgd_optmizer.cc
-    regularizer.cc
 )
 
 set(OPITMIZER_Headers
-    # adadelta_optimizer.h
-    # adagrad_optimizer.h
-    # adam_optimizer.h
+    adadelta_optimizer.h
+    adagrad_optimizer.h
+    adam_optimizer.h
     lr_policy.h
     optimizer.h
     parameter_optimizer.h
-    regularizer.h
     sgd_optimizer.h
     Tensor.h
 )
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index c5537bde8534c9ce84e4c89bd6637bac9c265047..f10ee1bcd4beffaa7fad0aaba78870a51d3f4944 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -1,5 +1,6 @@
 #include "adadelta_optimizer.h"
 #include <algorithm>
+#include <cmath>
 
 namespace paddle {
 namespace optimizer {
@@ -7,28 +8,30 @@ namespace optimizer {
 void AdadeltaOptimizer::set_weight(Tensor* p) {
   size_t size = p->size();
   real* gptr = new real[size];
-  accum_gradient = Tensor(gptr, size);
+  accum_gradient = new Tensor(gptr, size);
   real* dptr = new real[size];
-  accum_delta = Tensor(dptr, size);
+  accum_delta = new Tensor(dptr, size);
   real* dptr_current = new real[size];
-  update_delta = Tensor(dptr_current, size);
+  update_delta = new Tensor(dptr_current, size);
 }
 
-void AdadeltaOptimizer::update(const Tensor& gradient) {
+void AdadeltaOptimizer::update(const Tensor* gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
-  for (size_t i = 0; i < parameter_->size(); ++i) {
-    accum_gradient[i] =
-        rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i];
+  Tensor& param = *parameter_;
+  const Tensor& grad = *gradient;
+  Tensor& accum_g = *accum_gradient;
+  Tensor& accum_d = *accum_delta;
+  Tensor& update_d = *update_delta;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
 
-    update_delta[i] = std::sqrt(accum_delta[i] + epsilon) /
-                      std::sqrt(accum_gradient[i] + epsilon) * gradient[i];
+    update_d[i] = std::sqrt(accum_d[i] + epsilon) /
+                  std::sqrt(accum_g[i] + epsilon) * grad[i];
 
-    accum_delta[i] =
-        rho * accum_delta[i] + (1.0 - rho) * update_delta[i] * update_delta[i];
+    accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
 
-    parameter_[i] -=
-        learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
+    param[i] -= learning_rate * update_d[i] + learning_rate * decay * param[i];
   }
 }
 }  // namespace optimizer
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index d9db5d09c2269ef4d830a4fa4cf65c99c4095251..1d8bd5a654c66a3d2ad247a8ba58be6c5355baa0 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -19,7 +19,7 @@ public:
     if (accum_delta) delete accum_delta;
     if (update_delta) delete update_delta;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 3d47e35896ce84e0e0033a4af00c3fdba089ed24..437bd4682d5aac726143cc7a284b3965b337be53 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -1,26 +1,26 @@
+#include <cmath>
+
 #include "adagrad_optimizer.h"
 
 namespace paddle {
 namespace optimizer {
 
 void AdagradOptimizer::set_weight(Tensor* p) {
-  size_t size = p->width();
+  size_t size = p->size();
   real* gptr = new real[size];
-  accum_gradient = Tensor(gptr, size);
-  real* dptr = new real[size];
-  accum_delta = Tensor(dtpr, size);
-  real* dptr_current = new real[size];
-  update_delta = Tensor(dptr_current, size);
+  accum_gradient = new Tensor(gptr, size);
 }
 
-void AdagradOptimizer::update(const Tensor& gradient) {
+void AdagradOptimizer::update(const Tensor* gradient) {
   num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate();
-  for (size_t i = 0; i < parameter_.size(); ++i) {
-    accum_gradient[i] += gradient[i] * gradient[i];
-    parameter_[i] +=
-        learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) +
-                         decay * parameter_[i]);
+  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+  Tensor& param = *parameter_;
+  const Tensor& grad = *gradient;
+  Tensor& accum_g = *accum_gradient;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] += grad[i] * grad[i];
+    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon) +
+                learning_rate * decay * param[i];
   }
 }
 
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 0f6ce06f35a076147691785327a6c0ff4a26362b..aa5f74ffcdf9beaf8e1525bd383cdbf9c6ba92fe 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -13,7 +13,7 @@ public:
   ~AdagradOptimizer() {
     if (accum_gradient) delete accum_gradient;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index d9cc3344d5932553c0b3d62be7fca297264646b6..6b3f275bf0641624d5d6ed0bded04ff38d880cdc 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -1,29 +1,32 @@
 #include "adam_optimizer.h"
+#include <cmath>
 
 namespace paddle {
 namespace optimizer {
 
 void AdamOptimizer::set_weight(Tensor *p) {
-  size_t size = p->width();
+  size_t size = p->size();
   real *mptr = new real[size];
-  momentums_ = Tensor(mptr, size);
+  momentums_ = new Tensor(mptr, size);
   real *vptr = new real[size];
-  velocitys_ = Tensor(vtpr, size);
+  velocitys_ = new Tensor(vptr, size);
 }
 
-void AdamOptimizer::update(const Tensor &gradient) {
+void AdamOptimizer::update(const Tensor *gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
   double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
   learning_rate *= std::sqrt(coef2) / coef1;
-  for (size_t i = 0; i < parameter_->size(); ++i) {
-    momentums_[i] = beta_1 * momentums_[i] + (1.0 - beta_1) * gradient[i];
-    velocitys_[i] =
-        beta_2 * velocitys_[i] + (1.0 - beta_2) * gradient[i] * gradient[i];
-    parameter_[i] -=
-        learning_rate * (momentums_[i] / std::sqrt(velocitys_[i] + epsilon) +
-                         decay * parameter_[i]);
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  Tensor &v = *velocitys_;
+  for (size_t i = 0; i < param.size(); ++i) {
+    m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
+    v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
+    param[i] -=
+        learning_rate * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
   }
 }
 }  // namespace optimizer
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 68e2aa0223e02988917d1bde01252724ad8ec4f4..55a44b032df87cb56d7b89a2c924719ed5b62bc1 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -19,7 +19,7 @@ public:
     if (momentums_) delete momentums_;
     if (velocitys_) delete velocitys_;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
 
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index a4f39836bafc2e7c8c0c1666d58c432573dfe638..0124cfdc1916bfd5455dd6f08633d1a04a833053 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -24,7 +24,7 @@ public:
   virtual ~ParameterOptimizer() { delete parameter_; };
 
   static ParameterOptimizer *create(const ::std::string &config_proto);
-  virtual void update(const Tensor &gradient) = 0;
+  virtual void update(const Tensor *gradient) = 0;
   virtual real *get_weight() const;
   virtual void set_weight(Tensor *parameter);
 
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 375c99b30b87bb84fb1459d435174a8be4448f07..4eb483c0fbd1e34b9393a9beacd67ab73c4ec3e7 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -15,7 +15,7 @@ public:
   SGDOptimizer(double m, double d, bool n, BaseLr* lr)
       : ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
   virtual ~SGDOptimizer() { delete momentums_; }
-  void update(const Tensor& gradient);
+  void update(const Tensor* gradient);
   void set_weight(Tensor* p);
   real* get_weight() const;
 
diff --git a/paddle/optimizer/sgd_optmizer.cc b/paddle/optimizer/sgd_optmizer.cc
index 020867b93d5df96a292cb4e845703d6c7b1e62f9..5fdfc89c1f8cf9854c3d6c4f5834f202887ff8c1 100644
--- a/paddle/optimizer/sgd_optmizer.cc
+++ b/paddle/optimizer/sgd_optmizer.cc
@@ -13,24 +13,25 @@ void SGDOptimizer::set_weight(Tensor *p) {
   }
 }
 
-void SGDOptimizer::update(const Tensor &gradient) {
+void SGDOptimizer::update(const Tensor *gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   real velocity = 0.0;
   Tensor &param = *parameter_;
-  for (size_t i = 0; i < parameter_->size(); ++i) {
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  for (size_t i = 0; i < param.size(); ++i) {
     if (momentum == 0.0) {
-      velocity =
-          -learning_rate * gradient[i] - learning_rate * decay * parameter_[i];
+      velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
     } else {
-      momentums_[i] = momentum * momentums_[i] - learning_rate * gradient[i] -
-                      learning_rate * decay * parameter_[i];
-      velocity = momentums_[i];
+      m[i] = momentum * m[i] - learning_rate * grad[i] -
+             learning_rate * decay * param[i];
+      velocity = m[i];
     }
     if (nesterov) {
-      parameter_[i] += momentum * velocity - learning_rate * gradient[i];
+      param[i] += momentum * velocity - learning_rate * grad[i];
     } else {
-      parameter_[i] += velocity;
+      param[i] += velocity;
     }
   }
 }
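
For reference, the per-element rule that the AdamOptimizer::update hunk above implements is m[i] = beta_1 * m[i] + (1 - beta_1) * g[i] and v[i] = beta_2 * v[i] + (1 - beta_2) * g[i]^2, followed by a bias-corrected step that also folds in L2 weight decay. Below is a minimal, self-contained sketch of that same rule written against plain std::vector<double> buffers instead of the Paddle Tensor class; it is illustrative only, not part of the patch, and the function name adam_update and its parameter list are invented for the example.

#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative sketch of the element-wise Adam step performed in
// AdamOptimizer::update above; not part of the PaddlePaddle patch.
void adam_update(std::vector<double>& param, const std::vector<double>& grad,
                 std::vector<double>& m,  // first-moment estimate
                 std::vector<double>& v,  // second-moment estimate
                 double lr, double beta_1, double beta_2, double epsilon,
                 double decay, int num_sample_passed) {
  // Bias correction, matching the coef1/coef2 terms in the patch.
  const double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
  const double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
  const double learning_rate = lr * std::sqrt(coef2) / coef1;
  for (size_t i = 0; i < param.size(); ++i) {
    m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
    v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
    // Gradient step plus decay * param[i] as the weight-decay term.
    param[i] -=
        learning_rate * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
  }
}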