diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
index 134ca9e9d644d8cd6c2eef48a1a9f3921c7684c3..06f6d83efe1155c8cbe76644d6b9efc91349b006 100644
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
@@ -1,9 +1,9 @@
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 set(OPITMIZER_SRCS
-    adadelta_optimizer.cc
-    adagrad_optimizer.cc
-    adam_optimizer.cc
+    # adadelta_optimizer.cc
+    # adagrad_optimizer.cc
+    # adam_optimizer.cc
     optimizer.cc
     parameter_optimizer.cc
     sgd_optmizer.cc
@@ -11,9 +11,9 @@ set(OPITMIZER_SRCS
 )
 set(OPITMIZER_Headers
-    adadelta_optimizer.h
-    adagrad_optimizer.h
-    adam_optimizer.h
+    # adadelta_optimizer.h
+    # adagrad_optimizer.h
+    # adam_optimizer.h
     lr_policy.h
     optimizer.h
     parameter_optimizer.h
diff --git a/paddle/optimizer/Tensor.h b/paddle/optimizer/Tensor.h
index a8387c4df41ec76cce69b03d58abfb2c7863c655..d5ba4b3159f7d8dcdbb8ea9b32d70288caef3982 100644
--- a/paddle/optimizer/Tensor.h
+++ b/paddle/optimizer/Tensor.h
@@ -5,6 +5,7 @@
  */
 #include
+#include "optimizer.h"
 #include "paddle/math/BaseMatrix.h"
 namespace paddle {
@@ -16,10 +17,14 @@ using TensorBase = BaseMatrixT;
 template
 class Tensor : public TensorBase {
 public:
-  Tensor(T* data, int size) : TensorBase(size, 1, 0, data, false, false) {}
+  Tensor(T* data, int size) : TensorBase(1, size, 0, data, false, false) {}
   T* get_buffer() { return this->data_; }
+  T& operator[](const int idx) {
+    CHECK(idx >= 0 && idx < this->width_) << " out of index range";
+    return this->data_[idx];
+  }
   // TODO: replace with tensorshape
-  size_t width() { return this->width_; }
+  size_t size() const { return this->width_; }
 };
 } // namespace optimizer
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 39d465cebe6826eeaf41d506d707ece237cfcfeb..b76c123ec8038c2f5e6131b5a54d501ede92819c 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -3,21 +3,14 @@
 namespace paddle {
 namespace optimizer {
-template
-AdadeltaOptimizer::AdadeltaOptimizer(const ::paddle::OptimizerConfig& config)
-    : ParameterOptimizer(config) {
-  rho = config.adadelta().rho();
-  epsilon = config.adadelta().epsilon();
-  decay = config.adadelta().decay();
-}
 template
 void AdadeltaOptimizer::set_weight(const Tensor* p) {
-  size_t size = p->width();
+  size_t size = p->size();
   T* gptr = new T[size];
   accum_gradient = Tensor(gptr, size);
   T* dptr = new T[size];
-  accum_delta = Tensor(dtpr, size);
+  accum_delta = Tensor(dptr, size);
   T* dptr_current = new T[size];
   update_delta = Tensor(dptr_current, size);
 }
@@ -25,8 +18,8 @@ void AdadeltaOptimizer::set_weight(const Tensor* p) {
 template
 void AdadeltaOptimizer::update(const Tensor& gradient) {
   num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate();
-  for (size_t i = 0; i < parameter_.size(); ++i) {
+  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+  for (size_t i = 0; i < parameter_->size(); ++i) {
     accum_gradient[i] =
         rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i];
@@ -36,7 +29,8 @@ void AdadeltaOptimizer::update(const Tensor& gradient) {
     accum_delta[i] = rho * accum_delta[i] +
                      (1.0 - rho) * update_delta[i] * update_delta[i];
-    parameter_[i] -= update_delta[i] + decay * parameter_[i];
+    parameter_[i] -=
+        learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
   }
 }
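Note on the Adadelta change above: the update now scales both the accumulated delta and the weight-decay term by the rate returned from the lr policy. For reference, the per-element rule can be written as the standalone sketch below; the statement that derives `update_delta` falls between the hunks shown, so the standard Adadelta form is assumed for it, and all names are illustrative rather than part of the patch.

```cpp
#include <cmath>
#include <cstddef>

// Illustrative sketch of the per-element Adadelta rule applied by
// AdadeltaOptimizer::update above. The line that computes update_delta is not
// visible in the hunks, so the standard Adadelta form is assumed here.
void adadelta_step(double* param, const double* grad, double* accum_grad,
                   double* accum_delta, double* update_delta, size_t n,
                   double rho, double epsilon, double decay, double lr) {
  for (size_t i = 0; i < n; ++i) {
    // E[g^2] <- rho * E[g^2] + (1 - rho) * g^2
    accum_grad[i] = rho * accum_grad[i] + (1.0 - rho) * grad[i] * grad[i];
    // delta = sqrt((E[dx^2] + eps) / (E[g^2] + eps)) * g   (assumed, see note)
    update_delta[i] =
        std::sqrt((accum_delta[i] + epsilon) / (accum_grad[i] + epsilon)) * grad[i];
    // E[dx^2] <- rho * E[dx^2] + (1 - rho) * delta^2
    accum_delta[i] = rho * accum_delta[i] +
                     (1.0 - rho) * update_delta[i] * update_delta[i];
    // parameter step with the same learning-rate scaling and decay the patch uses
    param[i] -= lr * update_delta[i] + lr * decay * param[i];
  }
}
```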
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 1a8c03f26827cbf8fce57d2393badc0affcb384c..35f3ff86fcfdf9463be15607a4266d4713f3e31e 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -9,7 +9,12 @@ namespace optimizer {
 template
 class AdadeltaOptimizer : public ParameterOptimizer {
 public:
-  AdadeltaOptimizer(const OptimizerConfig &config);
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
+
+  AdadeltaOptimizer(double rho, double epsilon, double decay, BaseLr *lr)
+      : ParameterOptimizer(lr), rho(rho), epsilon(epsilon), decay(decay) {}
   ~AdadeltaOptimizer() {
     if (accum_gradient) delete accum_gradient;
     if (accum_delta) delete accum_delta;
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 40402a671081d0ffb932b18c9d46343e86495ef9..7b451cb4074bcf7257ff1a0654dccf2e106e1716 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -3,11 +3,6 @@
 namespace paddle {
 namespace optimizer {
 template
-AdagradOptimizer::AdagradOptimizer(const ::paddle::OptimizerConfig& config)
-    : ParameterOptimizer(config) {
-  epsilon = config.adagrad().epsilon();
-  decay = config.adagrad().decay();
-}
 template
 void AdagradOptimizer::set_weight(const Tensor* p) {
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 1ec438fd05a5ce6d1eea0e521a8fd1c722af8244..a01040f30f99ff1ef2569ae264b753fa8d7bd5c9 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -9,7 +9,11 @@ namespace optimizer {
 template
 class AdagradOptimizer : public ParameterOptimizer {
 public:
-  AdagradOptimizer(const OptimizerConfig &config);
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
+  AdagradOptimizer(double epsilon, double decay, BaseLr *lr)
+      : ParameterOptimizer(lr), epsilon(epsilon), decay(decay) {}
   ~AdagradOptimizer() {
     if (accum_gradient) delete accum_gradient;
   }
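The `using ParameterOptimizer::...` declarations added to the optimizer headers are needed because the base class is now a dependent type: inside a class template, unqualified names are not looked up in a dependent base during two-phase lookup. A minimal, self-contained illustration of the pattern, with hypothetical names that are not part of the patch:

```cpp
// Why the headers add using-declarations: members of a dependent base class
// are not found by unqualified lookup inside a class template.
template <class T>
struct Base {
  T value_;
};

template <class T>
struct Derived : Base<T> {
  using Base<T>::value_;  // without this, the bare `value_` below would not
                          // compile (writing this->value_ also works)
  void set(T v) { value_ = v; }
};

int main() {
  Derived<float> d;
  d.set(1.0f);
  return 0;
}
```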
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index c2303c6545e56a00c01b99785d78a3eb47b71d0d..b2d2ddc596ada8275288fc63fb76d344d523a44c 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -2,14 +2,6 @@
 namespace paddle {
 namespace optimizer {
-template
-AdamOptimizer::AdamOptimizer(const ::paddle::OptimizerConfig &config)
-    : ParameterOptimizer(config) {
-  beta_1 = config.adam().beta_1();
-  beta_2 = config.adam().beta_2();
-  epsilon = config.adam().epsilon();
-  decay = config.adam().decay();
-}
 template
 void AdamOptimizer::set_weight(const Tensor *p) {
@@ -23,11 +15,16 @@ void AdamOptimizer::set_weight(const Tensor *p) {
 template
 void AdamOptimizer::update(const Tensor &gradient) {
   num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate();
-  for (size_t i = 0; i < parameter_.size(); ++i) {
-    accum_gradient[i] += gradient[i] * gradient[i];
-    parameter_[i] +=
-        learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) +
+  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+  double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
+  double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
+  learning_rate *= std::sqrt(coef2) / coef1;
+  for (size_t i = 0; i < parameter_->size(); ++i) {
+    momentums_[i] = beta_1 * momentums_[i] + (1.0 - beta_1) * gradient[i];
+    velocitys_[i] =
+        beta_2 * velocitys_[i] + (1.0 - beta_2) * gradient[i] * gradient[i];
+    parameter_[i] -=
+        learning_rate * (momentums_[i] / std::sqrt(velocitys_[i] + epsilon) +
                          decay * parameter_[i]);
   }
 }
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index ceec18eb33603ccaabdcdbce1f65b23aed833e94..cf81bd70a64db123ca6e4214b061781a8c510963 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -9,8 +9,20 @@ namespace optimizer {
 template
 class AdamOptimizer : public ParameterOptimizer {
 public:
-  AdamOptimizer(const OptimizerConfig &config);
-  ~AdamOptimizer() {}
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
+  AdamOptimizer(
+      double beta_1, double beta_2, double epsilon, double decay, BaseLr *lr)
+      : ParameterOptimizer(lr),
+        beta_1(beta_1),
+        beta_2(beta_2),
+        epsilon(epsilon),
+        decay(decay) {}
+  ~AdamOptimizer() {
+    if (momentums_) delete momentums_;
+    if (velocitys_) delete velocitys_;
+  }
   void update(const Tensor &gradient);
   void set_weight(const Tensor *p);
   T *get_weight() const;
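The rewritten AdamOptimizer::update keeps first- and second-moment estimates and folds the bias corrections into the step size, i.e. lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t), which is the usual efficient formulation of Adam. For reference, an equivalent standalone version of that step, with illustrative names and raw arrays instead of the patch's Tensor type:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>

// Sketch of the Adam step implemented above. Folding the bias corrections into
// the step size is equivalent to dividing m and v by (1 - beta^t) separately.
void adam_step(double* param, const double* grad, double* m, double* v,
               size_t n, uint64_t t, double lr, double beta1, double beta2,
               double epsilon, double decay) {
  const double lr_t =
      lr * std::sqrt(1.0 - std::pow(beta2, t)) / (1.0 - std::pow(beta1, t));
  for (size_t i = 0; i < n; ++i) {
    m[i] = beta1 * m[i] + (1.0 - beta1) * grad[i];            // first moment
    v[i] = beta2 * v[i] + (1.0 - beta2) * grad[i] * grad[i];  // second moment
    // parameter step plus the weight-decay term the patch keeps inside lr_t
    param[i] -= lr_t * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
  }
}
```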
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index 6977b68de7ba788d5ea3b7707364a07ca6bf4f65..5aad87f45fb0619e514de00e65a5ceb9dac0ef01 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -1,19 +1,18 @@
 #ifndef PADDLE_OPTIMIZER_LR_POLICY_H_
 #define PADDLE_OPTIMIZER_LR_POLICY_H_
-#include "OptimizerConfig.ph.h"
+#include "OptimizerConfig.pb.h"
 namespace paddle {
 namespace optimizer {
 class BaseLr {
 public:
-  LrPolicyBase(const OpitmizerConfig &config) {
-    learning_rate = config.lr_config().learning_rate();
-  }
+  BaseLr(double lr) : learning_rate(lr) {}
+  virtual ~BaseLr() {}
   virtual double get_learning_rate(const uint64_t num_sample_passed) = 0;
-private:
+protected:
   double learning_rate;
 };
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index e72881e5d0dba8242055f734c7681def22f65962..ff6558147eeb866351f724e559e0e1841ac15718 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -3,7 +3,7 @@
 #include "parameter_optimizer.h"
-template
+template
 struct EnumToType {};
 template
@@ -11,15 +11,14 @@ struct TypeToEnum {};
 #define MATCH_ENUM_TYPE(TYPE, ENUM)                     \
   template <>                                           \
-  struct TypeToEnum {                                   \
+  struct TypeToEnum {                                   \
     static paddle_element_type v() { return ENUM; };    \
-    static constexpr TYPE value = ENUM;
-}
-;
-template <>
-struct EnumToType {
-  typedef TYPE Type;
-}
+    static constexpr TYPE value = ENUM;                 \
+  };                                                    \
+  template <>                                           \
+  struct EnumToType {                                   \
+    typedef TYPE Type;                                  \
+  }
 MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
 MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
@@ -27,11 +26,10 @@ MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
 MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
 MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
 MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
-
-struct paddle_optimizer {
+  struct paddle_optimizer {
   /*! \brief optmizer in C++ side */
-  paddle::optimizer::ParameterOptimzier* impl;
+  paddle::optimizer::ParameterOptimizerBase* impl;
 };
 paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
@@ -48,7 +46,7 @@ int paddle_release_optimizer(paddle_optimizer* o) {
 }
 int paddle_update_parameter(paddle_optimizer* o,
-                            paddle_element_type data_type,
+                            const paddle_element_type data_type,
                             const void* grad_buffer,
                             int num_bytes) {
   auto type = EnumToType::Type;
@@ -59,7 +57,7 @@ int paddle_update_parameter(paddle_optimizer* o,
 }
 int paddle_optimizer_set_weights(paddle_optimizer* o,
-                                 paddle_element_type data_type,
+                                 const paddle_element_type data_type,
                                  void* param_buffer,
                                  int num_bytes) {
   auto type = EnumToType::Type;
diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h
index 0eba2e781187ccc894134aa69bfc7f8d896645d1..a2c2b13405b296e773ba6f8d5a480e9a39bf7e5a 100644
--- a/paddle/optimizer/optimizer.h
+++ b/paddle/optimizer/optimizer.h
@@ -64,7 +64,7 @@ int paddle_release_optimizer(paddle_optimizer* o);
  * @return return exec status
  */
 int paddle_update_parameter(paddle_optimizer* o,
-                            paddle_element_type data_type,
+                            const paddle_element_type data_type,
                             const void* gradient,
                             int num_bytes);
@@ -76,7 +76,7 @@ int paddle_update_parameter(paddle_optimizer* o,
  * @return return exec status
  */
 int paddle_optimizer_set_weights(paddle_optimizer* o,
-                                 paddle_element_type data_type,
+                                 const paddle_element_type data_type,
                                  void* param_buffer,
                                  int num_bytes);
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index c5e9e0acc30300c07200aad708a5591eba335a6c..4bd384d77da1deb7f86c72cca6f66947eddb726f 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -1,6 +1,11 @@
-#include "parameter_optimizer.h"
 #include
-#include "optimizer_factory.h"
+#include "adadelta_optimizer.h"
+#include "adagrad_optimizer.h"
+#include "adam_optimizer.h"
+#include "lr_policy.h"
+#include "sgd_optimizer.h"
+
+#include "parameter_optimizer.h"
 namespace paddle {
 namespace optimizer {
@@ -12,29 +17,40 @@ ParameterOptimizer *ParameterOptimizer::create(
   CHECK(config.ParseFromString(config_proto) == 0)
       << "error : optimizer config";
   CHECK(config_valid(config) == 0) << "error : invalid optimizer config ";
+
+  BaseLr *lr = nullptr;
+  switch (config.lr_policy()) {
+    case "ConstLr":
+      lr = new ConstLr(config.lr_config().learning_rate());
+      break;
+  }
   ParameterOptimizer *opt = nullptr;
   switch (config.optimizer_name()) {
     case "SGD":
-      opt = new SGDOptimizer(config);
+      opt = new SGDOptimizer(config.sgd().momentum(),
+                             config.sgd().decay(),
+                             config.sgd().nesterov(),
+                             lr);
       break;
     case "Adagrad":
-      opt = new AdagradOptimizer(config);
+      opt = new AdagradOptimizer(
+          config.adagrad().epsilon(), config.adagrad().decay(), lr);
       break;
     case "Adadelta":
-      opt = new AdadeltaOptimizer(config);
+      opt = new AdadeltaOptimizer(config.adadelta().rho(),
+                                  config.adadelta().epsilon(),
+                                  config.adadelta().decay(),
+                                  lr);
       break;
     case "Adam":
-      opt = new AdamOptimizer(config);
+      opt = new AdamOptimizer(config.adam().beta_1(),
+                              config.adam().beta_2(),
+                              config.adam().epsilon(),
+                              config.adam().decay(),
+                              lr);
       break;
-    default:
-      opt = new SGDOptimizer(config);
   }
-  switch (config.lr_policy()) {
-    case "ConstLr":
-      opt.lr_policy = new ConstLr(config);
-      break;
-  }
   return opt;
 }
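ParameterOptimizer::create() now builds a BaseLr policy first and hands it to the optimizer constructors. ConstLr itself is referenced by the factory but not defined in this patch; a minimal constant-rate policy written against the new BaseLr interface would presumably look like the sketch below, which is an assumed shape rather than the actual implementation.

```cpp
#include <cstdint>

#include "lr_policy.h"

namespace paddle {
namespace optimizer {

// Assumed sketch of the constant learning-rate policy instantiated by
// ParameterOptimizer::create(); not part of this patch.
class ConstLr : public BaseLr {
public:
  explicit ConstLr(double lr) : BaseLr(lr) {}
  double get_learning_rate(const uint64_t /*num_sample_passed*/) override {
    return learning_rate;  // accessible because BaseLr now makes it protected
  }
};

}  // namespace optimizer
}  // namespace paddle
```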
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index d5914857af06a9fa077d0c8d33669d7978ff5681..40994aa86703b0437f0795f1c0323c26596c564a 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -11,6 +11,12 @@ namespace paddle {
 namespace optimizer {
+class ParameterOptimizerBase {
+private:
+  ParameterOptimizerBase(const ParameterOptimizerBase &) = delete;
+  ParameterOptimizerBase &operator=(const ParameterOptimizerBase &) = delete;
+};
+
 template
 class ParameterOptimizer {
 public:
@@ -18,18 +24,18 @@ public:
    * @brief update hook for algorithm need to traverse parameter more than
    * once.
    */
+  // use config to pack training state
   ParameterOptimizer(const OptimizerConfig &config) : config_(config){};
+  ParameterOptimizer(BaseLr *lr) : lr_policy(lr), num_sample_passed(0) {}
+  virtual ~ParameterOptimizer() { delete parameter_; };
+
   static ParameterOptimizer *create(const ::std::string &config_proto);
-  virtual void update(const Tensor &gradient) = 0;
-  virtual void destroy() = 0;
+  virtual void update(const Tensor &gradient) = 0;
   virtual T *get_weight() const;
   virtual void set_weight(const Tensor *parameter);
-  // package optimizer config proto in runtime for saving checkpoint
-  virtual char *get_config_proto();
-  ~ParameterOptimzier() { delete parameter_; }
-private:
+public:
   bool config_valid(::std::string &config) const;
   OptimizerConfig config_;
   Tensor *parameter_;
@@ -37,12 +43,6 @@ private:
   // learning rate policy
   BaseLr *lr_policy;
   uint64_t num_sample_passed;
-
-  ParameterOptimizer(const ParameterOptimizer &) = delete;
-  ParameterOptimizer &operator=(const ParameterOptimizer &) = delete;
-  /**
-   * @brief indicate if use L1, L2 regularizer
-   */
 };
 } // namespace optimizer
diff --git a/paddle/optimizer/regularizer.cc b/paddle/optimizer/regularizer.cc
index dd21c20e711940242abb31a0c169e5f1026cecf8..5724511a827f4521a31c05f38483490fe094e7fa 100644
--- a/paddle/optimizer/regularizer.cc
+++ b/paddle/optimizer/regularizer.cc
@@ -19,6 +19,8 @@ Regularizer* Regularizer::create(const std::string& config) {
 template class L1Regularizer;
 template class L1Regularizer;
+template class L2Regularizer;
+template class L2Regularizer;
 } // namespace optimizer
 } // namespace paddle
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 4e1d9669c96c73f2152179eee90b0a765e809b8e..5115825590d82baded7e2384ed54f5bd75402058 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -9,8 +9,18 @@ namespace optimizer {
 template
 class SGDOptimizer : public ParameterOptimizer {
 public:
-  SGDOptimizer(const ::paddle::OptimizerConfig& config);
-  ~SGDOptimizer() {
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
+
+  SGDOptimizer(double m,
+               double d,
+               bool n,
+               double learning_rate,
+               uint64_t num_sample_passed,
+               BaseLr* lr)
+      : ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
+  virtual ~SGDOptimizer() {
     // clear memory by Tensor library
     delete momentums_;
   }
@@ -18,7 +28,6 @@ public:
   void set_weight(const Tensor* p);
   T* get_weight() const;
-  char* get_config_proto();
 private:
   Tensor* momentums_;
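SGDOptimizer now receives momentum, decay and nesterov directly, plus the shared BaseLr policy, and the sgd_optmizer.cc diff that follows applies a velocity-based update. Only part of that loop is visible in the hunks, so the sketch below fills the hidden branch with the classical momentum/Nesterov formulation (an assumption) and uses illustrative names throughout.

```cpp
#include <cstddef>

// Compact sketch of a momentum / Nesterov SGD step consistent with the visible
// portion of SGDOptimizer::update below; the branch cut off by the hunk
// boundaries is filled in with the classical formulation (assumption).
void sgd_step(double* param, const double* grad, double* velocity, size_t n,
              double lr, double momentum, double decay, bool nesterov) {
  for (size_t i = 0; i < n; ++i) {
    const double g = grad[i] + decay * param[i];  // L2 decay folded into the gradient
    if (momentum == 0.0) {
      param[i] -= lr * g;
    } else {
      velocity[i] = momentum * velocity[i] - lr * g;
      // Nesterov applies the velocity "looked ahead" by one momentum step.
      param[i] += nesterov ? (momentum * velocity[i] - lr * g) : velocity[i];
    }
  }
}
```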
diff --git a/paddle/optimizer/sgd_optmizer.cc b/paddle/optimizer/sgd_optmizer.cc
index ff23d46dc6f713a062f0841a854bb6a538907085..cd1635fecdca0cf9803849ff3a6a4b6eb265c32f 100644
--- a/paddle/optimizer/sgd_optmizer.cc
+++ b/paddle/optimizer/sgd_optmizer.cc
@@ -3,18 +3,10 @@
 namespace paddle {
 namespace optimizer {
-template
-SGDOptimizer::SGDOptimizer(const ::paddle::OptimizerConfig &config)
-    : ParameterOptimizer(config) {
-  momentum = config.sgd().momentum();
-  decay = config.sgd().decay();
-  nesterov = config.sgd().nesterov();
-}
-
 template
 void SGDOptimizer::set_weight(const Tensor *p) {
   //  ParameterOptimizer::set_weight(p);
-  size_t size = p->width();
+  size_t size = p->size();
   // TODO: fix it with align aware allocator bind to Tensor
   if (momentum != 0.0) {
     T *ptr = new T[size];
@@ -27,7 +19,7 @@ void SGDOptimizer::update(const Tensor &gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   double velocity = 0.0;
-  for (size_t i = 0; i < parameter_.size(); ++i) {
+  for (size_t i = 0; i < parameter_->size(); ++i) {
     if (momentum == 0.0) {
       velocity = -learning_rate * gradient[i] -
                  learning_rate * decay * parameter_[i];
@@ -44,15 +36,6 @@ void SGDOptimizer::update(const Tensor &gradient) {
   }
 }
-template
-char *SGDOptimizer::get_config_proto() {
-  ParameterOptimizer::get_config_proto();
-  config.set_learning_rate(learning_rate);
-  config.set_decay(decay);
-  config.set_nesterov(nesterov);
-  return config.SerializeAsString().c_str();
-}
-
 template class SGDOptimizer;
 template class SGDOptimizer;
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
index c1080f4e168a363ca1d637834629206a3513a5bc..d42426765073a1d27b265d2ef5feff94535cdf48 100644
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
@@ -12,7 +12,7 @@ message SGDConfig {
   optional double momentum = 21 [default = 0.0];
   optional double decay = 23 [default = 0.0];
   optional bool nesterov =24 [default = false];
-
+}
 
 message AdadeltaConfig {
@@ -95,5 +95,4 @@ message OptimizerConfig {
   // common config of optimizer
   optional double clipnorm = 101;
   optional double clipvalue = 102;
-
 }
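Taken together, the exported C interface is driven by a serialized OptimizerConfig message. The sketch below strings the touched calls together; the (buffer, length) shape of paddle_create_optimizer and the include path are assumptions, since only the first parameter of that function is visible in the hunks above.

```cpp
#include <string>
#include <vector>

#include "paddle/optimizer/optimizer.h"

// Usage sketch of the C interface this patch exercises. `config_proto` is a
// serialized OptimizerConfig message produced elsewhere (e.g. by the trainer);
// the length argument to paddle_create_optimizer is assumed.
void run_one_step(const std::string& config_proto,
                  std::vector<float>& weights,
                  const std::vector<float>& grads) {
  paddle_optimizer* opt = paddle_create_optimizer(
      reinterpret_cast<const unsigned char*>(config_proto.data()),
      static_cast<int>(config_proto.size()));

  // Hand the parameter buffer to the optimizer, apply one gradient, clean up.
  paddle_optimizer_set_weights(opt, PADDLE_ELEMENT_TYPE_FLOAT32, weights.data(),
                               static_cast<int>(weights.size() * sizeof(float)));
  paddle_update_parameter(opt, PADDLE_ELEMENT_TYPE_FLOAT32, grads.data(),
                          static_cast<int>(grads.size() * sizeof(float)));
  paddle_release_optimizer(opt);
}
```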