diff --git a/paddle/optimizer/Tensor.h b/paddle/optimizer/Tensor.h
index d5ba4b3159f7d8dcdbb8ea9b32d70288caef3982..d779bb50709153dee19dc2709608674875000006 100644
--- a/paddle/optimizer/Tensor.h
+++ b/paddle/optimizer/Tensor.h
@@ -5,7 +5,6 @@
  */
 
 #include <string.h>
-#include "optimizer.h"
 #include "paddle/math/BaseMatrix.h"
 
 namespace paddle {
@@ -15,18 +14,27 @@
 template <class T>
 using TensorBase = BaseMatrixT<T>;
 
 template <class T>
-class Tensor : public TensorBase<T> {
+class TensorT : public TensorBase<T> {
 public:
-  Tensor(T* data, int size) : TensorBase<T>(1, size, 0, data, false, false) {}
+  TensorT(T* data, int size) : TensorBase<T>(1, size, 0, data, false, false) {}
+  TensorT(const TensorT& t)
+      : TensorBase<T>(1, t.size(), 0, t.get_buffer(), false, false) {}
+  TensorT& operator=(const TensorT& t) {
+    this->size_ = t.size();
+    this->data_ = t.get_buffer();
+    return *this;
+  }
   T* get_buffer() { return this->data_; }
   T& operator[](const int idx) {
-    CHECK(idx >= 0 && idx < this->width_) << " out of index range";
+    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
     return this->data_[idx];
   }
   // TODO: replace with tensorshape
   size_t size() const { return this->width_; }
 };
 
+// TODO(zhihong): design problem of dynamic datatype, need to fix
+typedef TensorT<real> Tensor;
+
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index b76c123ec8038c2f5e6131b5a54d501ede92819c..c5537bde8534c9ce84e4c89bd6637bac9c265047 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -4,19 +4,17 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-void AdadeltaOptimizer<T>::set_weight(const Tensor<T>* p) {
+void AdadeltaOptimizer::set_weight(Tensor* p) {
   size_t size = p->size();
-  T* gptr = new T[size];
-  accum_gradient = Tensor<T>(gptr, size);
-  T* dptr = new T[size];
-  accum_delta = Tensor<T>(dptr, size);
-  T* dptr_current = new T[size];
-  update_delta = Tensor<T>(dptr_current, size);
+  real* gptr = new real[size];
+  accum_gradient = Tensor(gptr, size);
+  real* dptr = new real[size];
+  accum_delta = Tensor(dptr, size);
+  real* dptr_current = new real[size];
+  update_delta = Tensor(dptr_current, size);
 }
 
-template <class T>
-void AdadeltaOptimizer<T>::update(const Tensor<T>& gradient) {
+void AdadeltaOptimizer::update(const Tensor& gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   for (size_t i = 0; i < parameter_->size(); ++i) {
@@ -33,9 +31,5 @@ void AdadeltaOptimizer::update(const Tensor& gradient) {
         learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
   }
 }
-
-template class AdadeltaOptimizer<float>;
-template class AdadeltaOptimizer<double>;
-
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 35f3ff86fcfdf9463be15607a4266d4713f3e31e..d9db5d09c2269ef4d830a4fa4cf65c99c4095251 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -6,28 +6,27 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-class AdadeltaOptimizer : public ParameterOptimizer<T> {
+class AdadeltaOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer<T>::parameter_;
-  using ParameterOptimizer<T>::num_sample_passed;
-  using ParameterOptimizer<T>::lr_policy;
-
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
   AdadeltaOptimizer(double rho, double epsilon, double decay, BaseLr *lr)
-      : ParameterOptimizer<T>(lr),
-        rho(rho), epsilon(epsilon), decay(decay) {}
+      : ParameterOptimizer(lr), rho(rho), epsilon(epsilon), decay(decay) {}
   ~AdadeltaOptimizer() {
     if (accum_gradient) delete accum_gradient;
     if (accum_delta) delete accum_delta;
     if (update_delta) delete update_delta;
   }
-  void update(const Tensor<T> &gradient);
-  void set_weight(const Tensor<T> *p);
-  T *get_weight() const;
+  void update(const Tensor &gradient);
+  void set_weight(Tensor *p);
+  real *get_weight() const;
 
 private:
-  Tensor<T> *accum_gradient;
-  Tensor<T> *accum_delta;
-  Tensor<T> *update_delta;
+  Tensor *accum_gradient;
+  Tensor *accum_delta;
+  Tensor *update_delta;
   double rho;
   double epsilon;
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 7b451cb4074bcf7257ff1a0654dccf2e106e1716..3d47e35896ce84e0e0033a4af00c3fdba089ed24 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -2,21 +2,18 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-template <class T>
-void AdagradOptimizer<T>::set_weight(const Tensor<T>* p) {
+void AdagradOptimizer::set_weight(Tensor* p) {
   size_t size = p->width();
-  T* gptr = new T[size];
-  accum_gradient = Tensor<T>(gptr, size);
-  T* dptr = new T[size];
-  accum_delta = Tensor<T>(dtpr, size);
-  T* dptr_current = new T[size];
-  update_delta = Tensor<T>(dptr_current, size);
+  real* gptr = new real[size];
+  accum_gradient = Tensor(gptr, size);
+  real* dptr = new real[size];
+  accum_delta = Tensor(dptr, size);
+  real* dptr_current = new real[size];
+  update_delta = Tensor(dptr_current, size);
 }
 
-template <class T>
-void AdagradOptimizer<T>::update(const Tensor<T>& gradient) {
+void AdagradOptimizer::update(const Tensor& gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate();
   for (size_t i = 0; i < parameter_.size(); ++i) {
@@ -27,7 +24,5 @@ void AdagradOptimizer::update(const Tensor& gradient) {
   }
 }
 
-template class AdagradOptimizer<float>;
-template class AdagradOptimizer<double>;
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index a01040f30f99ff1ef2569ae264b753fa8d7bd5c9..0f6ce06f35a076147691785327a6c0ff4a26362b 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -6,23 +6,19 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-class AdagradOptimizer : public ParameterOptimizer<T> {
+class AdagradOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer<T>::parameter_;
-  using ParameterOptimizer<T>::num_sample_passed;
-  using ParameterOptimizer<T>::lr_policy;
   AdagradOptimizer(double epsilon, double decay, BaseLr *lr)
-      : ParameterOptimizer<T>(lr), epsilon(epsilon), decay(decay) {}
+      : ParameterOptimizer(lr), epsilon(epsilon), decay(decay) {}
   ~AdagradOptimizer() {
     if (accum_gradient) delete accum_gradient;
   }
-  void update(const Tensor<T> &gradient);
-  void set_weight(const Tensor<T> *p);
-  T *get_weight() const;
+  void update(const Tensor &gradient);
+  void set_weight(Tensor *p);
+  real *get_weight() const;
 
 private:
-  Tensor<T> *accum_gradient;
+  Tensor *accum_gradient;
   double epsilon;
   double decay;
 };
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index b2d2ddc596ada8275288fc63fb76d344d523a44c..d9cc3344d5932553c0b3d62be7fca297264646b6 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -3,17 +3,15 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-void AdamOptimizer<T>::set_weight(const Tensor<T> *p) {
+void AdamOptimizer::set_weight(Tensor *p) {
   size_t size = p->width();
-  T *mptr = new T[size];
-  momentums_ = Tensor<T>(mptr, size);
-  T *vptr = new T[size];
-  velocitys_ = Tensor<T>(vtpr, size);
+  real *mptr = new real[size];
+  momentums_ = Tensor(mptr, size);
+  real *vptr = new real[size];
+  velocitys_ = Tensor(vptr, size);
 }
 
-template <class T>
-void AdamOptimizer<T>::update(const Tensor<T> &gradient) {
+void AdamOptimizer::update(const Tensor &gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
@@ -28,8 +26,5 @@ void AdamOptimizer::update(const Tensor &gradient) {
                              decay * parameter_[i]);
   }
 }
-
-template class AdamOptimizer<float>;
-template class AdamOptimizer<double>;
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index cf81bd70a64db123ca6e4214b061781a8c510963..68e2aa0223e02988917d1bde01252724ad8ec4f4 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -6,15 +6,11 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-class AdamOptimizer : public ParameterOptimizer<T> {
+class AdamOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer<T>::parameter_;
-  using ParameterOptimizer<T>::num_sample_passed;
-  using ParameterOptimizer<T>::lr_policy;
   AdamOptimizer(
       double beta_1, double beta_2, double epsilon, double decay, BaseLr *lr)
-      : ParameterOptimizer<T>(lr),
+      : ParameterOptimizer(lr),
         beta_1(beta_1),
         beta_2(beta_2),
         epsilon(epsilon),
@@ -23,13 +19,13 @@ public:
     if (momentums_) delete momentums_;
     if (velocitys_) delete velocitys_;
   }
-  void update(const Tensor<T> &gradient);
-  void set_weight(const Tensor<T> *p);
-  T *get_weight() const;
+  void update(const Tensor &gradient);
+  void set_weight(Tensor *p);
+  real *get_weight() const;
 
 private:
-  Tensor<T> *momentums_;
-  Tensor<T> *velocitys_;
+  Tensor *momentums_;
+  Tensor *velocitys_;
   double beta_1;
   double beta_2;
   double epsilon;
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index 5aad87f45fb0619e514de00e65a5ceb9dac0ef01..e1017cf32dcfc8b5bcd4591e971cff2a5ccd5b58 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -1,6 +1,7 @@
 #ifndef PADDLE_OPTIMIZER_LR_POLICY_H_
 #define PADDLE_OPTIMIZER_LR_POLICY_H_
 
+#include <algorithm>
 #include "OptimizerConfig.pb.h"
 
 namespace paddle {
@@ -19,11 +20,25 @@ protected:
 // constant learning rate policy
 class ConstLr final : public BaseLr {
 public:
+  ConstLr(double lr) : BaseLr(lr){};
   double get_learning_rate(const uint64_t num_sample_passed) {
     return learning_rate;
   }
 };
 
+class LinearLr final : public BaseLr {
+public:
+  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
+      : BaseLr(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+  double get_learning_rate(const uint64_t num_sample_passed) {
+    return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
+  }
+
+private:
+  double lr_decay_a;
+  double lr_decay_b;
+};
+
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index ff6558147eeb866351f724e559e0e1841ac15718..fb2e543bf32570c9744296c50256663aee7da4ff 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -2,8 +2,9 @@
 #include <string>
 #include "parameter_optimizer.h"
 
+using namespace paddle::optimizer;
+
-template<paddle_element_type VType>
+template <paddle_element_type VType>
 struct EnumToType {};
 
 template <class T>
@@ -26,17 +27,16 @@
 MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
 MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
 MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
 MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
-
-struct paddle_optimizer {
-  /*! \brief optimizer in C++ side */
-  paddle::optimizer::ParameterOptimizerBase* impl;
+struct paddle_optimizer {
+  paddle::optimizer::ParameterOptimizer* impl;
 };
 
 paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
                                           int config_proto_len) {
-  paddle_optimizer* optimizer;
+  paddle_optimizer* optimizer = new paddle_optimizer;
   std::string config(config_proto, config_proto + config_proto_len);
-  optimizer->impl->create(config_proto);
+  optimizer->impl = ParameterOptimizer::create(config);
   return optimizer;
 }
@@ -49,9 +49,9 @@ int paddle_update_parameter(paddle_optimizer* o,
                             const paddle_element_type data_type,
                             const void* grad_buffer,
                             int num_bytes) {
-  auto type = EnumToType<data_type>::Type;
-  paddle::Tensor<type> gradient(reinterpret_cast<type*>(grad_buffer),
-                                num_bytes);
+  // TODO(zhihong): datatype does not work yet; need to add runtime datatype
+  auto grad = reinterpret_cast<const real*>(grad_buffer);
+  Tensor gradient(const_cast<real*>(grad), num_bytes);
   o->impl->update(gradient);
   return PADDLE_SUCCESS;
 }
@@ -60,9 +60,8 @@ int paddle_optimizer_set_weights(paddle_optimizer* o,
                                  const paddle_element_type data_type,
                                  void* param_buffer,
                                  int num_bytes) {
-  auto type = EnumToType<data_type>::Type;
-  paddle::Tensor<type>* param = new paddle::Tensor<type>(
-      reinterpret_cast<type*>(param_buffer), num_bytes);
+  // TODO(zhihong): datatype does not work yet; need to add runtime datatype
+  Tensor* param = new Tensor(reinterpret_cast<real*>(param_buffer), num_bytes);
   o->impl->set_weight(param);
   return PADDLE_SUCCESS;
 }
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index 4bd384d77da1deb7f86c72cca6f66947eddb726f..6d9fa5c8024013250e6e0b76707c9f0b467f5646 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -10,78 +10,60 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-ParameterOptimizer<T> *ParameterOptimizer<T>::create(
+ParameterOptimizer *ParameterOptimizer::create(
     const ::std::string &config_proto) {
   paddle::OptimizerConfig config;
   CHECK(config.ParseFromString(config_proto) == 0)
       << "error : optimizer config";
-  CHECK(config_valid(config) == 0) << "error : invalid optimizer config ";
-  BaseLr *lr = nullptr;
-  switch (config.lr_policy()) {
-    case "ConstLr":
-      lr = new ConstLr(config.lr_config().learning_rate());
-      break;
-  }
-  ParameterOptimizer *opt = nullptr;
-  switch (config.optimizer_name()) {
-    case "SGD":
-      opt = new SGDOptimizer<T>(config.sgd().momentum(),
-                                config.sgd().decay(),
-                                config.sgd().nesterov(),
-                                lr);
-      break;
-    case "Adagrad":
-      opt = new AdagradOptimizer<T>(
-          config.adagrad().epsilon(), config.adagrad().decay(), lr);
-      break;
-    case "Adadelta":
-      opt = new AdadeltaOptimizer<T>(config.adadelta().rho(),
-                                     config.adadelta().epsilon(),
-                                     config.adadelta().decay(),
-                                     lr);
-      break;
-    case "Adam":
-      opt = new AdamOptimizer<T>(config.adam().beta_1(),
-                                 config.adam().beta_2(),
-                                 config.adam().epsilon(),
-                                 config.adam().decay(),
-                                 lr);
-      break;
-  }
-
-  return opt;
-}
-
-template <class T>
-T *ParameterOptimizer<T>::get_weight() const {
-  return parameter.get().get_buffer();
-}
-
-template <class T>
-char *ParameterOptimizer<T>::get_config_proto() const {
-  // set config dynamic value for save checkpoint
-  config_.lr_policy().set_learning_rate(
-      lr_policy->get_learning_rate(num_sample_passed));
-  config_.set_num_sample_passed(num_sample_passed);
-  config_.set_iterations(iterations);
-  return config_.SerializeAsString().c_str();
-}
-
-template <class T>
-void ParameterOptimizer<T>::set_weight(const Tensor<T> *p) {
-  parameter_ = p;
-}
-
-template <class T>
-bool ParameterOptimizer<T>::config_valid(const ::std::string &config) const {
-  // TODO(zhihong) : add more value checker, failed ASAP
-  return true;
-}
-
-template class ParameterOptimzier<float>;
-template class ParameterOptimzier<double>;
+  auto select_lr_policy = [=](const OptimizerConfig &config) -> BaseLr * {
+    std::string s(config.lr_policy());
+    if (s == "ConstLr") return new ConstLr(config.lr_config().learning_rate());
+    if (s == "LinearLr")
+      return new LinearLr(config.lr_config().learning_rate(),
+                          config.lr_config().lr_decay_a(),
+                          config.lr_config().lr_decay_b());
+    // default
+    return new ConstLr(config.lr_config().learning_rate());
+  };
+  BaseLr *lr = select_lr_policy(config);
+  auto select_optimizer =
+      [=](const OptimizerConfig &config) -> ParameterOptimizer * {
+    std::string s(config.optimizer_name());
+    if (s == "SGD") {
+      return new SGDOptimizer(config.sgd().momentum(),
+                              config.sgd().decay(),
+                              config.sgd().nesterov(),
+                              lr);
+    }
+    if (s == "Adadelta") {
+      return new AdadeltaOptimizer(config.adadelta().rho(),
+                                   config.adadelta().epsilon(),
+                                   config.adadelta().decay(),
+                                   lr);
+    }
+    if (s == "Adagrad") {
+      return new AdagradOptimizer(
+          config.adagrad().epsilon(), config.adagrad().decay(), lr);
+    }
+    if (s == "Adam") {
+      return new AdamOptimizer(config.adam().beta_1(),
+                               config.adam().beta_2(),
+                               config.adam().epsilon(),
+                               config.adam().decay(),
+                               lr);
+    }
+    // default
+    return new SGDOptimizer(config.sgd().momentum(),
+                            config.sgd().decay(),
+                            config.sgd().nesterov(),
+                            lr);
+  };
+  return select_optimizer(config);
+}
+
+real *ParameterOptimizer::get_weight() const {
+  return parameter_->get_buffer();
+}
+
+void ParameterOptimizer::set_weight(Tensor *p) { parameter_ = p; }
 
 }  // namespace optimizer
 }  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index 40994aa86703b0437f0795f1c0323c26596c564a..a4f39836bafc2e7c8c0c1666d58c432573dfe638 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -11,13 +11,6 @@
 namespace paddle {
 namespace optimizer {
 
-class ParameterOptimizerBase {
-private:
-  ParameterOptimizerBase(const ParameterOptimizerBase &) = delete;
-  ParameterOptimizerBase &operator=(const ParameterOptimizerBase &) = delete;
-};
-
-template <class T>
 class ParameterOptimizer {
 public:
   /**
    * @brief  update hook for algorithm need to traverse over all parameters,
    * exposing parameters in optimizer library
    */
@@ -31,14 +24,13 @@ public:
   virtual ~ParameterOptimizer() { delete parameter_; };
 
   static ParameterOptimizer *create(const ::std::string &config_proto);
-  virtual void update(const Tensor<T> &gradient) = 0;
-  virtual T *get_weight() const;
-  virtual void set_weight(const Tensor<T> *parameter);
+  virtual void update(const Tensor &gradient) = 0;
+  virtual real *get_weight() const;
+  virtual void set_weight(Tensor *parameter);
 
 public:
-  bool config_valid(::std::string &config) const;
   OptimizerConfig config_;
-  Tensor<T> *parameter_;
+  Tensor *parameter_;
 
   // learning rate policy
   BaseLr *lr_policy;
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index 5115825590d82baded7e2384ed54f5bd75402058..375c99b30b87bb84fb1459d435174a8be4448f07 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -6,31 +6,22 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-class SGDOptimizer : public ParameterOptimizer<T> {
+class SGDOptimizer : public ParameterOptimizer {
 public:
-  using ParameterOptimizer<T>::parameter_;
-  using ParameterOptimizer<T>::num_sample_passed;
-  using ParameterOptimizer<T>::lr_policy;
-
-  SGDOptimizer(double m,
-               double d,
-               bool n,
-               double learning_rate,
-               uint64_t num_sample_passed,
-               BaseLr* lr)
-      : ParameterOptimizer<T>(lr), momentum(m), decay(d), nesterov(n) {}
-  virtual ~SGDOptimizer() {
-    // clear memory by Tensor library
-    delete momentums_;
-  }
-  void update(const Tensor<T>& gradient);
-
-  void set_weight(const Tensor<T>* p);
-  T* get_weight() const;
+  using ParameterOptimizer::parameter_;
+  using ParameterOptimizer::num_sample_passed;
+  using ParameterOptimizer::lr_policy;
+
+  SGDOptimizer(double m, double d, bool n, BaseLr* lr)
+      : ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
+  virtual ~SGDOptimizer() { delete momentums_; }
+  void update(const Tensor& gradient);
+
+  void set_weight(Tensor* p);
+  real* get_weight() const;
 
 private:
-  Tensor<T>* momentums_;
+  Tensor* momentums_;
   double momentum;
   double decay;
   bool nesterov;
diff --git a/paddle/optimizer/sgd_optmizer.cc b/paddle/optimizer/sgd_optmizer.cc
index cd1635fecdca0cf9803849ff3a6a4b6eb265c32f..03ddc81451715b458b33afdeb10d9bfd24f1b354 100644
--- a/paddle/optimizer/sgd_optmizer.cc
+++ b/paddle/optimizer/sgd_optmizer.cc
@@ -3,23 +3,21 @@
 namespace paddle {
 namespace optimizer {
 
-template <class T>
-void SGDOptimizer<T>::set_weight(const Tensor<T> *p) {
+void SGDOptimizer::set_weight(Tensor *p) {
   //  ParameterOptimizer::set_weight(p);
   size_t size = p->size();
   // TODO: fix it with align aware allocator bind to Tensor
   if (momentum != 0.0) {
-    T *ptr = new T[size];
-    momentums_ = Tensor<T>(ptr, size);
+    real *ptr = new real[size];
+    momentums_ = new Tensor(ptr, size);
   }
 }
 
-template <class T>
-void SGDOptimizer<T>::update(const Tensor<T> &gradient) {
+void SGDOptimizer::update(const Tensor &gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   double velocity = 0.0;
-  Tensor &for (size_t i = 0; i < parameter_->size(); ++i) {
+  for (size_t i = 0; i < parameter_->size(); ++i) {
     if (momentum == 0.0) {
       velocity = -learning_rate * gradient[i] -
                  learning_rate * decay * parameter_[i];
@@ -36,8 +34,5 @@
   }
 }
 
-template class SGDOptimizer<float>;
-template class SGDOptimizer<double>;
-
 }  // namespace optimizer
 }  // namespace paddle
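
For reference, the LinearLr policy added in lr_policy.h computes lr(n) = max(learning_rate - lr_decay_a * n, lr_decay_b), where n is the number of samples passed, while ConstLr simply returns the configured rate. The sketch below restates that arithmetic as standalone free functions so it can be compiled and run in isolation; it illustrates the formula only and is not part of the patch or of the paddle::optimizer API.

// Standalone sketch of the two learning-rate policies above, restated as free
// functions (illustration only; not the BaseLr class hierarchy in the patch).
#include <algorithm>
#include <cstdint>
#include <cstdio>

// ConstLr: ignores training progress entirely.
double const_lr(double learning_rate, uint64_t /*num_sample_passed*/) {
  return learning_rate;
}

// LinearLr: decays linearly with the samples seen, floored at lr_decay_b.
double linear_lr(double learning_rate,
                 double lr_decay_a,
                 double lr_decay_b,
                 uint64_t num_sample_passed) {
  return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
}

int main() {
  const uint64_t samples[] = {0, 1000, 10000, 100000};
  for (uint64_t n : samples) {
    std::printf("samples=%-7llu const=%.4f linear=%.4f\n",
                static_cast<unsigned long long>(n),
                const_lr(0.1, n),
                linear_lr(0.1, 1e-6, 0.01, n));
  }
  return 0;
}

With learning_rate = 0.1, lr_decay_a = 1e-6 and lr_decay_b = 0.01, the rate reaches the 0.01 floor after 90,000 samples.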
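
The SGD hunk above shows the momentum == 0 branch of SGDOptimizer::update(): velocity = -learning_rate * gradient[i] - learning_rate * decay * parameter_[i]. A minimal per-element sketch of that step follows. Two assumptions are made here: that `real` maps to float, and that the computed velocity is then added to the parameter (that application step lies outside the hunk); the helper name sgd_step is illustrative only.

// Minimal sketch of the momentum == 0 branch of SGDOptimizer::update() shown
// in the hunk above. Assumptions: `real` maps to float, and the velocity is
// added to the parameter afterwards (that part of update() is not in the hunk).
#include <cstddef>
#include <cstdio>
#include <vector>

void sgd_step(std::vector<float>& param,
              const std::vector<float>& grad,
              double learning_rate,
              double decay) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    // velocity = -lr * grad[i] - lr * decay * param[i], as in the hunk
    double velocity =
        -learning_rate * grad[i] - learning_rate * decay * param[i];
    param[i] += static_cast<float>(velocity);  // assumed application step
  }
}

int main() {
  std::vector<float> param = {1.0f, -0.5f, 0.25f};
  std::vector<float> grad = {0.1f, 0.2f, -0.4f};
  sgd_step(param, grad, /*learning_rate=*/0.01, /*decay=*/0.001);
  for (float p : param) std::printf("%f\n", p);
  return 0;
}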