Commit 5b8a0c5d authored by: D dzhwinter

"optimizer remove init create with proto"

Parent 3158efe9
include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(OPITMIZER_SRCS
adadelta_optimizer.cc
adagrad_optimizer.cc
adam_optimizer.cc
# adadelta_optimizer.cc
# adagrad_optimizer.cc
# adam_optimizer.cc
optimizer.cc
parameter_optimizer.cc
sgd_optmizer.cc
......@@ -11,9 +11,9 @@ set(OPITMIZER_SRCS
)
set(OPITMIZER_Headers
adadelta_optimizer.h
adagrad_optimizer.h
adam_optimizer.h
# adadelta_optimizer.h
# adagrad_optimizer.h
# adam_optimizer.h
lr_policy.h
optimizer.h
parameter_optimizer.h
......
......@@ -5,6 +5,7 @@
*/
#include <string.h>
#include "optimizer.h"
#include "paddle/math/BaseMatrix.h"
namespace paddle {
......@@ -16,10 +17,14 @@ using TensorBase = BaseMatrixT<T>;
template <class T>
class Tensor : public TensorBase<T> {
public:
Tensor(T* data, int size) : TensorBase<T>(size, 1, 0, data, false, false) {}
Tensor(T* data, int size) : TensorBase<T>(1, size, 0, data, false, false) {}
T* get_buffer() { return this->data_; }
T& operator[](const int idx) {
CHECK(idx >= 0 && idx < this->width_) << " out of index range";
return this->data_[idx];
}
// TODO: replace with tensorshape
size_t width() { return this->width_; }
size_t size() const { return this->width_; }
};
} // namespace optimizer
......
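The swapped constructor arguments above make the buffer a 1 x size row vector, so operator[]'s bounds check against width_ and the new size() accessor stay consistent. Below is a minimal standalone sketch of the same non-owning view interface; it is an illustration only, not the real BaseMatrixT-backed class.

#include <cassert>
#include <cstddef>

// Minimal non-owning 1-D view with the same interface as the Tensor wrapper
// above (get_buffer / operator[] / size). Standalone illustration; the real
// class derives from paddle's BaseMatrixT.
template <class T>
class TensorView {
public:
  TensorView(T* data, size_t size) : data_(data), size_(size) {}
  T* get_buffer() { return data_; }
  T& operator[](size_t idx) {
    assert(idx < size_ && "out of index range");
    return data_[idx];
  }
  size_t size() const { return size_; }

private:
  T* data_;      // not owned, mirrors BaseMatrixT's external-buffer mode
  size_t size_;  // element count, i.e. the width of the 1 x size row vector
};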
......@@ -3,21 +3,14 @@
namespace paddle {
namespace optimizer {
template <class T>
AdadeltaOptimizer<T>::AdadeltaOptimizer(const ::paddle::OptimizerConfig& config)
: ParameterOptimizer<T>(config) {
rho = config.adadelta().rho();
epsilon = config.adadelta().epsilon();
decay = config.adadelta().decay();
}
template <class T>
void AdadeltaOptimizer<T>::set_weight(const Tensor<T>* p) {
size_t size = p->width();
size_t size = p->size();
T* gptr = new T[size];
accum_gradient = Tensor<T>(gptr, size);
T* dptr = new T[size];
accum_delta = Tensor<T>(dtpr, size);
accum_delta = Tensor<T>(dptr, size);
T* dptr_current = new T[size];
update_delta = Tensor<T>(dptr_current, size);
}
......@@ -25,8 +18,8 @@ void AdadeltaOptimizer<T>::set_weight(const Tensor<T>* p) {
template <class T>
void AdadeltaOptimizer<T>::update(const Tensor<T>& gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate();
for (size_t i = 0; i < parameter_.size(); ++i) {
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
for (size_t i = 0; i < parameter_->size(); ++i) {
accum_gradient[i] =
rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i];
......@@ -36,7 +29,8 @@ void AdadeltaOptimizer<T>::update(const Tensor<T>& gradient) {
accum_delta[i] =
rho * accum_delta[i] + (1.0 - rho) * update_delta[i] * update_delta[i];
parameter_[i] -= update_delta[i] + decay * parameter_[i];
parameter_[i] -=
learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
}
}
......
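For reference, this is the Adadelta recurrence the hunk above implements, written out over plain arrays. The expression for update_delta is elided in the diff, so the standard square-root-ratio form is assumed here; the other names mirror the members above.

#include <cmath>
#include <cstddef>

// Hedged sketch of the Adadelta step; update_delta uses the conventional
// sqrt((E[dx^2]+eps)/(E[g^2]+eps)) * g form, which this hunk does not show.
void adadelta_step(double* param, const double* grad, double* accum_grad,
                   double* accum_delta, double* update_delta, size_t n,
                   double rho, double epsilon, double decay,
                   double learning_rate) {
  for (size_t i = 0; i < n; ++i) {
    accum_grad[i] = rho * accum_grad[i] + (1.0 - rho) * grad[i] * grad[i];
    update_delta[i] = std::sqrt((accum_delta[i] + epsilon) /
                                (accum_grad[i] + epsilon)) * grad[i];
    accum_delta[i] =
        rho * accum_delta[i] + (1.0 - rho) * update_delta[i] * update_delta[i];
    // matches the added line above: both the delta and the weight-decay
    // term are scaled by the learning rate before subtracting
    param[i] -= learning_rate * update_delta[i] +
                learning_rate * decay * param[i];
  }
}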
......@@ -9,7 +9,12 @@ namespace optimizer {
template <class T>
class AdadeltaOptimizer : public ParameterOptimizer<T> {
public:
AdadeltaOptimizer(const OptimizerConfig &config);
using ParameterOptimizer<T>::parameter_;
using ParameterOptimizer<T>::num_sample_passed;
using ParameterOptimizer<T>::lr_policy;
AdadeltaOptimizer(double rho, double epsilon, double decay, BaseLr *lr)
: ParameterOptimizer<T>(lr), rho(rho), epsilon(epsilon), decay(decay) {}
~AdadeltaOptimizer() {
if (accum_gradient) delete accum_gradient;
if (accum_delta) delete accum_delta;
......
......@@ -3,11 +3,6 @@
namespace paddle {
namespace optimizer {
template <class T>
AdagradOptimizer<T>::AdagradOptimizer(const ::paddle::OptimizerConfig& config)
: ParameterOptimizer<T>(config) {
epsilon = config.adagrad().epsilon();
decay = config.adagrad().decay();
}
template <class T>
void AdagradOptimizer<T>::set_weight(const Tensor<T>* p) {
......
......@@ -9,7 +9,11 @@ namespace optimizer {
template <class T>
class AdagradOptimizer : public ParameterOptimizer<T> {
public:
AdagradOptimizer(const OptimizerConfig &config);
using ParameterOptimizer<T>::parameter_;
using ParameterOptimizer<T>::num_sample_passed;
using ParameterOptimizer<T>::lr_policy;
AdagradOptimizer(double epsilon, double decay, BaseLr *lr)
: ParameterOptimizer<T>(lr), epsilon(epsilon), decay(decay) {}
~AdagradOptimizer() {
if (accum_gradient) delete accum_gradient;
}
......
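The accum_gradient, epsilon and decay members above imply the conventional Adagrad step; the update body itself is not part of this hunk, so the sketch below is an assumption of the usual form rather than a copy of the implementation.

#include <cmath>
#include <cstddef>

// Hedged sketch of the conventional Adagrad step suggested by the members
// of AdagradOptimizer above (accumulate squared gradients, divide by their
// root, add an L2-style decay term).
void adagrad_step(double* param, const double* grad, double* accum_grad,
                  size_t n, double epsilon, double decay,
                  double learning_rate) {
  for (size_t i = 0; i < n; ++i) {
    accum_grad[i] += grad[i] * grad[i];  // running sum of squared gradients
    param[i] -= learning_rate *
                (grad[i] / std::sqrt(accum_grad[i] + epsilon) +
                 decay * param[i]);
  }
}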
......@@ -2,14 +2,6 @@
namespace paddle {
namespace optimizer {
template <class T>
AdamOptimizer<T>::AdamOptimizer(const ::paddle::OptimizerConfig &config)
: ParameterOptimizer<T>(config) {
beta_1 = config.adam().beta_1();
beta_2 = config.adam().beta_2();
epsilon = config.adam().epsilon();
decay = config.adam().decay();
}
template <class T>
void AdamOptimizer<T>::set_weight(const Tensor<T> *p) {
......@@ -23,11 +15,16 @@ void AdamOptimizer<T>::set_weight(const Tensor<T> *p) {
template <class T>
void AdamOptimizer<T>::update(const Tensor<T> &gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate();
for (size_t i = 0; i < parameter_.size(); ++i) {
accum_gradient[i] += gradient[i] * gradient[i];
parameter_[i] +=
learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) +
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
learning_rate *= std::sqrt(coef2) / coef1;
for (size_t i = 0; i < parameter_->size(); ++i) {
momentums_[i] = beta_1 * momentums_[i] + (1.0 - beta_1) * gradient[i];
velocitys_[i] =
beta_2 * velocitys_[i] + (1.0 - beta_2) * gradient[i] * gradient[i];
parameter_[i] -=
learning_rate * (momentums_[i] / std::sqrt(velocitys_[i] + epsilon) +
decay * parameter_[i]);
}
}
......
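The added lines pre-scale the learning rate by sqrt(1 - beta_2^t) / (1 - beta_1^t), which matches the usual bias-corrected Adam update (up to where epsilon enters the square root). A standalone sketch over plain arrays:

#include <cmath>
#include <cstddef>
#include <cstdint>

// Hedged sketch of the bias-corrected Adam step added in the hunk above.
void adam_step(double* param, const double* grad, double* momentum,
               double* velocity, size_t n, uint64_t t, double beta_1,
               double beta_2, double epsilon, double decay,
               double learning_rate) {
  const double coef1 = 1.0 - std::pow(beta_1, static_cast<double>(t));
  const double coef2 = 1.0 - std::pow(beta_2, static_cast<double>(t));
  learning_rate *= std::sqrt(coef2) / coef1;  // fold bias correction into lr
  for (size_t i = 0; i < n; ++i) {
    momentum[i] = beta_1 * momentum[i] + (1.0 - beta_1) * grad[i];
    velocity[i] = beta_2 * velocity[i] + (1.0 - beta_2) * grad[i] * grad[i];
    param[i] -= learning_rate *
                (momentum[i] / std::sqrt(velocity[i] + epsilon) +
                 decay * param[i]);
  }
}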
......@@ -9,8 +9,20 @@ namespace optimizer {
template <class T>
class AdamOptimizer : public ParameterOptimizer<T> {
public:
AdamOptimizer(const OptimizerConfig &config);
~AdamOptimizer() {}
using ParameterOptimizer<T>::parameter_;
using ParameterOptimizer<T>::num_sample_passed;
using ParameterOptimizer<T>::lr_policy;
AdamOptimizer(
double beta_1, double beta_2, double epsilon, double decay, BaseLr *lr)
: ParameterOptimizer<T>(lr),
beta_1(beta_1),
beta_2(beta_2),
epsilon(epsilon),
decay(decay) {}
~AdamOptimizer() {
if (momentums_) delete momentums_;
if (velocitys_) delete velocitys_;
}
void update(const Tensor<T> &gradient);
void set_weight(const Tensor<T> *p);
T *get_weight() const;
......
#ifndef PADDLE_OPTIMIZER_LR_POLICY_H_
#define PADDLE_OPTIMIZER_LR_POLICY_H_
#include "OptimizerConfig.ph.h"
#include "OptimizerConfig.pb.h"
namespace paddle {
namespace optimizer {
class BaseLr {
public:
LrPolicyBase(const OpitmizerConfig &config) {
learning_rate = config.lr_config().learning_rate();
}
BaseLr(double lr) : learning_rate(lr) {}
virtual ~BaseLr() {}
virtual double get_learning_rate(const uint64_t num_sample_passed) = 0;
private:
protected:
double learning_rate;
};
......
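BaseLr now receives the rate directly instead of parsing an OptimizerConfig. A constant policy is the smallest concrete subclass; the sketch below is fully standalone and uses "Sketch" names to avoid implying it is the ConstLr used elsewhere in this patch.

#include <cstdint>

// Standalone sketch of the policy pattern above: an abstract rate policy
// constructed from a double, plus a constant-rate concrete subclass.
class LrPolicySketch {
public:
  explicit LrPolicySketch(double lr) : learning_rate(lr) {}
  virtual ~LrPolicySketch() {}
  virtual double get_learning_rate(uint64_t num_sample_passed) = 0;

protected:
  double learning_rate;
};

class ConstLrSketch : public LrPolicySketch {
public:
  explicit ConstLrSketch(double lr) : LrPolicySketch(lr) {}
  double get_learning_rate(uint64_t /*num_sample_passed*/) override {
    return learning_rate;  // ignores progress; always the configured rate
  }
};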
......@@ -3,7 +3,7 @@
#include "parameter_optimizer.h"
template <class T>
template <paddle_element_type T>
struct EnumToType {};
template <class T>
......@@ -11,15 +11,14 @@ struct TypeToEnum {};
#define MATCH_ENUM_TYPE(TYPE, ENUM) \
template <> \
struct TypeToEnum<ENUM> { \
struct TypeToEnum<TYPE> { \
static paddle_element_type v() { return ENUM; }; \
static constexpr TYPE value = ENUM;
}
;
template <>
struct EnumToType<ENUM> {
typedef TYPE Type;
}
static constexpr TYPE value = ENUM; \
}; \
template <> \
struct EnumToType<ENUM> { \
typedef TYPE Type; \
}
MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
......@@ -27,11 +26,10 @@ MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
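MATCH_ENUM_TYPE generates a two-way mapping: TypeToEnum<T>::v() returns the enum tag for a C++ type, and EnumToType<E>::Type recovers the type for a compile-time enum constant. A reduced, self-contained sketch of the same technique with two hand-written specializations:

#include <type_traits>

// Standalone sketch of the type<->enum mapping; the real code generates
// these specializations with the MATCH_ENUM_TYPE macro above.
enum element_type { ELEM_FLOAT32, ELEM_FLOAT64 };

template <class T> struct TypeToEnumSketch {};
template <element_type E> struct EnumToTypeSketch {};

template <> struct TypeToEnumSketch<float>  { static element_type v() { return ELEM_FLOAT32; } };
template <> struct TypeToEnumSketch<double> { static element_type v() { return ELEM_FLOAT64; } };
template <> struct EnumToTypeSketch<ELEM_FLOAT32> { typedef float Type; };
template <> struct EnumToTypeSketch<ELEM_FLOAT64> { typedef double Type; };

static_assert(std::is_same<EnumToTypeSketch<ELEM_FLOAT64>::Type, double>::value,
              "enum -> type recovers the original C++ type");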
struct paddle_optimizer {
/*! \brief optmizer in C++ side */
paddle::optimizer::ParameterOptimzier* impl;
paddle::optimizer::ParameterOptimizerBase* impl;
};
paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
......@@ -48,7 +46,7 @@ int paddle_release_optimizer(paddle_optimizer* o) {
}
int paddle_update_parameter(paddle_optimizer* o,
paddle_element_type data_type,
const paddle_element_type data_type,
const void* grad_buffer,
int num_bytes) {
auto type = EnumToType<data_type>::Type;
......@@ -59,7 +57,7 @@ int paddle_update_parameter(paddle_optimizer* o,
}
int paddle_optimizer_set_weights(paddle_optimizer* o,
paddle_element_type data_type,
const paddle_element_type data_type,
void* param_buffer,
int num_bytes) {
auto type = EnumToType<data_type>::Type;
......
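Note that EnumToType<> needs a compile-time constant, while data_type here is a runtime parameter, so the line auto type = EnumToType<data_type>::Type; cannot compile as written; the usual bridge is a switch over the enum that picks the matching template instantiation. A standalone sketch with illustrative names (ElementType and apply_update are not part of this patch):

#include <cstdio>

// Standalone sketch of the usual runtime-to-compile-time bridge for a
// typed update entry point.
enum ElementType { ELEM_F32, ELEM_F64 };

template <class T>
int apply_update(const void* grad_buffer, int num_bytes) {
  const T* grad = static_cast<const T*>(grad_buffer);
  int n = num_bytes / static_cast<int>(sizeof(T));
  std::printf("updating with %d elements, first = %f\n", n,
              n > 0 ? static_cast<double>(grad[0]) : 0.0);
  return 0;
}

int dispatch_update(ElementType type, const void* grad_buffer, int num_bytes) {
  switch (type) {
    case ELEM_F32: return apply_update<float>(grad_buffer, num_bytes);
    case ELEM_F64: return apply_update<double>(grad_buffer, num_bytes);
  }
  return -1;  // unsupported element type
}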
......@@ -64,7 +64,7 @@ int paddle_release_optimizer(paddle_optimizer* o);
* @return return exec status
*/
int paddle_update_parameter(paddle_optimizer* o,
paddle_element_type data_type,
const paddle_element_type data_type,
const void* gradient,
int num_bytes);
......@@ -76,7 +76,7 @@ int paddle_update_parameter(paddle_optimizer* o,
* @return return exec status
*/
int paddle_optimizer_set_weights(paddle_optimizer* o,
paddle_element_type data_type,
const paddle_element_type data_type,
void* param_buffer,
int num_bytes);
......
#include "parameter_optimizer.h"
#include <glog/logging.h>
#include "optimizer_factory.h"
#include "adadelta_optimizer.h"
#include "adagrad_optimizer.h"
#include "adam_optimizer.h"
#include "lr_policy.h"
#include "sgd_optimizer.h"
#include "parameter_optimizer.h"
namespace paddle {
namespace optimizer {
......@@ -12,29 +17,40 @@ ParameterOptimizer<T> *ParameterOptimizer<T>::create(
CHECK(config.ParseFromString(config_proto) == 0)
<< "error : optimizer config";
CHECK(config_valid(config) == 0) << "error : invalid optimizer config ";
BaseLr *lr = nullptr;
switch (config.lr_policy()) {
case "ConstLr":
lr = new ConstLr(config.lr_config().learning_rate());
break;
}
ParameterOptimizer<T> *opt = nullptr;
switch (config.optimizer_name()) {
case "SGD":
opt = new SGDOptimizer<T>(config);
opt = new SGDOptimizer<T>(config.sgd().momentum(),
config.sgd().decay(),
config.sgd().nesterov(),
lr);
break;
case "Adagrad":
opt = new AdagradOptimizer<T>(config);
opt = new AdagradOptimizer<T>(
config.adagrad().epsilon(), config.adagrad().decay(), lr);
break;
case "Adadelta":
opt = new AdadeltaOptimizer<T>(config);
opt = new AdadeltaOptimizer<T>(config.adadelta().rho(),
config.adadelta().epsilon(),
config.adadelta().decay(),
lr);
break;
case "Adam":
opt = new AdamOptimizer<T>(config);
opt = new AdamOptimizer<T>(config.adam().beta_1(),
config.adam().beta_2(),
config.adam().epsilon(),
config.adam().decay(),
lr);
break;
default:
opt = new SGDOptimizer<T>(config);
}
switch (config.lr_policy()) {
case "ConstLr":
opt.lr_policy = new ConstLr(config);
break;
}
return opt;
}
......
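A C++ switch cannot take string labels, so if lr_policy() and optimizer_name() return strings, the dispatch above would normally be written with if/else comparisons (or the proto fields would be enums). A standalone sketch of the string form; Optimizer and create_by_name are illustrative stand-ins, not the classes in this patch.

#include <string>
#include <utility>

// Standalone sketch of string-keyed factory dispatch with a default branch,
// mirroring the structure of the create() hunk above.
struct Optimizer {
  explicit Optimizer(std::string n) : name(std::move(n)) {}
  std::string name;
};

Optimizer* create_by_name(const std::string& name) {
  if (name == "SGD")      return new Optimizer("SGD");
  if (name == "Adagrad")  return new Optimizer("Adagrad");
  if (name == "Adadelta") return new Optimizer("Adadelta");
  if (name == "Adam")     return new Optimizer("Adam");
  return new Optimizer("SGD");  // default branch, as in the hunk above
}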
......@@ -11,6 +11,12 @@
namespace paddle {
namespace optimizer {
class ParameterOptimizerBase {
private:
ParameterOptimizerBase(const ParameterOptimizerBase &) = delete;
ParameterOptimizerBase &operator=(const ParameterOptimizerBase &) = delete;
};
template <class T>
class ParameterOptimizer {
public:
......@@ -18,18 +24,18 @@ public:
* @brief update hook for algorithm need to traverse parameter more than
* once.
*/
// use config for pack trainig state
ParameterOptimizer(const OptimizerConfig &config) : config_(config){};
ParameterOptimizer(BaseLr *lr) : lr_policy(lr), num_sample_passed(0) {}
virtual ~ParameterOptimizer() { delete parameter_; };
static ParameterOptimizer *create(const ::std::string &config_proto);
virtual void update(const Tensor &gradient) = 0;
virtual void destroy() = 0;
virtual void update(const Tensor<T> &gradient) = 0;
virtual T *get_weight() const;
virtual void set_weight(const Tensor<T> *parameter);
// package optimizer config proto in runtime for saving checkpoint
virtual char *get_config_proto();
~ParameterOptimzier() { delete parameter_; }
private:
public:
bool config_valid(::std::string &config) const;
OptimizerConfig config_;
Tensor<T> *parameter_;
......@@ -37,12 +43,6 @@ private:
// learning rate policy
BaseLr *lr_policy;
uint64_t num_sample_passed;
ParameterOptimizer(const ParameterOptimizer &) = delete;
ParameterOptimizer &operator=(const ParameterOptimizer &) = delete;
/**
* @brief indicate if use L1, L2 regularizer
*/
};
} // namespace optimizer
......
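The using-declarations added in the derived optimizers exist because members of a dependent base class are not found by unqualified lookup inside a class template; they must be pulled in with using Base<T>::member or accessed through this->. A standalone illustration with hypothetical names:

#include <cstdint>

// Why the derived optimizers add "using ParameterOptimizer<T>::parameter_;"
// and friends: without the using-declaration (or this->), the unqualified
// name in step() would not compile.
template <class T>
struct Base {
  uint64_t num_sample_passed = 0;
};

template <class T>
struct Derived : Base<T> {
  using Base<T>::num_sample_passed;  // make the dependent-base member visible
  void step() { num_sample_passed += 1; }
};

int use_derived() {
  Derived<float> d;
  d.step();
  return static_cast<int>(d.num_sample_passed);  // 1
}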
......@@ -19,6 +19,8 @@ Regularizer<T>* Regularizer<T>::create(const std::string& config) {
template class L1Regularizer<float>;
template class L1Regularizer<double>;
template class L2Regularizer<float>;
template class L2Regularizer<double>;
} // namespace optimizer
} // namespace paddle
......@@ -9,8 +9,18 @@ namespace optimizer {
template <class T>
class SGDOptimizer : public ParameterOptimizer<T> {
public:
SGDOptimizer(const ::paddle::OptimizerConfig& config);
~SGDOptimizer() {
using ParameterOptimizer<T>::parameter_;
using ParameterOptimizer<T>::num_sample_passed;
using ParameterOptimizer<T>::lr_policy;
SGDOptimizer(double m,
double d,
bool n,
double learning_rate,
uint64_t num_sample_passed,
BaseLr* lr)
: ParameterOptimizer<T>(lr), momentum(m), decay(d), nesterov(n) {}
virtual ~SGDOptimizer() {
// clear memory by Tensor library
delete momentums_;
}
......@@ -18,7 +28,6 @@ public:
void set_weight(const Tensor<T>* p);
T* get_weight() const;
char* get_config_proto();
private:
Tensor<T>* momentums_;
......
......@@ -3,18 +3,10 @@
namespace paddle {
namespace optimizer {
template <class T>
SGDOptimizer<T>::SGDOptimizer(const ::paddle::OptimizerConfig &config)
: ParameterOptimizer<T>(config) {
momentum = config.sgd().momentum();
decay = config.sgd().decay();
nesterov = config.sgd().nesterov();
}
template <class T>
void SGDOptimizer<T>::set_weight(const Tensor<T> *p) {
// ParameterOptimizer::set_weight(p);
size_t size = p->width();
size_t size = p->size();
// TODO: fix it with align aware allocator bind to Tensor
if (momentum != 0.0) {
T *ptr = new T[size];
......@@ -27,7 +19,7 @@ void SGDOptimizer<T>::update(const Tensor<T> &gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
double velocity = 0.0;
for (size_t i = 0; i < parameter_.size(); ++i) {
for (size_t i = 0; i < parameter_->size(); ++i) {
if (momentum == 0.0) {
velocity =
-learning_rate * gradient[i] - learning_rate * decay * parameter_[i];
......@@ -44,15 +36,6 @@ void SGDOptimizer<T>::update(const Tensor<T> &gradient) {
}
}
template <class T>
char *SGDOptimizer<T>::get_config_proto() {
ParameterOptimizer::get_config_proto();
config.set_learning_rate(learning_rate);
config.set_decay(decay);
config.set_nesterov(nesterov);
return config.SerializeAsString().c_str();
}
template class SGDOptimizer<float>;
template class SGDOptimizer<double>;
......
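For reference, this is the momentum/Nesterov SGD step this file implements, over plain arrays. The momentum and nesterov branches are partly elided in the diff, so their conventional forms are assumed; only the momentum == 0 branch is taken from the visible lines.

#include <cstddef>

// Hedged sketch of the SGD update; elided branches use the conventional
// heavy-ball and Nesterov forms.
void sgd_step(double* param, const double* grad, double* momentums, size_t n,
              double momentum, double decay, bool nesterov,
              double learning_rate) {
  for (size_t i = 0; i < n; ++i) {
    double velocity = 0.0;
    if (momentum == 0.0) {
      // plain SGD with weight decay, as in the visible branch
      velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
    } else {
      // heavy-ball momentum (assumed form of the elided branch)
      momentums[i] = momentum * momentums[i] - learning_rate * grad[i] -
                     learning_rate * decay * param[i];
      velocity = momentums[i];
    }
    if (nesterov) {
      // Nesterov correction (assumed form of the elided branch)
      param[i] += momentum * velocity - learning_rate * grad[i];
    } else {
      param[i] += velocity;
    }
  }
}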
......@@ -12,7 +12,7 @@ message SGDConfig {
optional double momentum = 21 [default = 0.0];
optional double decay = 23 [default = 0.0];
optional bool nesterov =24 [default = false];
}
message AdadeltaConfig {
......@@ -95,5 +95,4 @@ message OptimizerConfig {
// common config of optimizer
optional double clipnorm = 101;
optional double clipvalue = 102;
}