Commit e4567962 authored by D dzhwinter

"update with comment"

Parent f5ff2838
......@@ -5,14 +5,6 @@
namespace paddle {
namespace optimizer {
void AdadeltaOptimizer::set_weight(Tensor* p) {
parameter_ = p;
size_t size = p->size();
accum_gradient_ = new Tensor(size);
accum_delta_ = new Tensor(size);
update_delta_ = new Tensor(size);
}
void AdadeltaOptimizer::Update(const Tensor* gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
......@@ -32,5 +24,39 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {
param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
}
}
const char* AdadeltaOptimizer::SerializeState(int* state_len) {
OptimizerState state;
state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
state.set_num_sample_passed(num_sample_passed_);
TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
TensorToProto(*accum_delta_, state.mutable_accum_delta());
TensorToProto(*update_delta_, state.mutable_update_delta());
state.set_rho(rho_);
state.set_epsilon(epsilon_);
state.set_decay(decay_);
*state_len += CalStateSize(parameter_,
accum_gradient_,
accum_delta_,
update_delta_,
rho_,
epsilon_,
decay_);
return state.SerializeAsString().c_str();
}
void AdadeltaOptimizer::DeSerializeState(const std::string& str) {
OptimizerState state;
state.ParseFromString(str);
lr_policy_->set(state.learning_rate());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.accum_gradient(), accum_gradient_);
ProtoToTensor(state.accum_delta(), accum_delta_);
ProtoToTensor(state.update_delta(), update_delta_);
}
} // namespace optimizer
} // namespace paddle
......@@ -7,21 +7,31 @@ namespace optimizer {
class AdadeltaOptimizer : public ParameterOptimizer {
public:
AdadeltaOptimizer(double rho, double epsilon, double decay, LrPolicy *lr)
: ParameterOptimizer(lr),
AdadeltaOptimizer(
Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
: ParameterOptimizer(parameter, lr),
accum_gradient_(nullptr),
accum_delta_(nullptr),
update_delta_(nullptr),
rho_(rho),
epsilon_(epsilon),
decay_(decay) {}
decay_(decay) {
size_t size = parameter->size();
if (accum_gradient_) delete accum_gradient_;
accum_gradient_ = new Tensor(size);
if (accum_delta_) delete accum_delta_;
accum_delta_ = new Tensor(size);
if (update_delta_) delete update_delta_;
update_delta_ = new Tensor(size);
}
~AdadeltaOptimizer() {
if (accum_gradient_) delete accum_gradient_;
if (accum_delta_) delete accum_delta_;
if (update_delta_) delete update_delta_;
}
void Update(const Tensor *gradient);
void set_weight(Tensor *p);
const char *SerializeState(int *state_len);
void DeSerializeState(const std::string &state);
private:
Tensor *accum_gradient_;
......
......@@ -5,12 +5,6 @@
namespace paddle {
namespace optimizer {
void AdagradOptimizer::set_weight(Tensor* p) {
parameter_ = p;
size_t size = p->size();
accum_gradient_ = new Tensor(size);
}
void AdagradOptimizer::Update(const Tensor* gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
......@@ -23,6 +17,8 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
learning_rate * decay_ * param[i];
}
}
const char* AdagradOptimizer::SerializeState(int* state_len) { NIMPL; }
void AdagradOptimizer::DeSerializeState(const std::string& str) { NIMPL; }
} // namespace optimizer
} // namespace paddle
......@@ -7,16 +7,19 @@ namespace optimizer {
class AdagradOptimizer : public ParameterOptimizer {
public:
AdagradOptimizer(double epsilon, double decay, LrPolicy *lr)
: ParameterOptimizer(lr),
accum_gradient_(nullptr),
epsilon_(epsilon),
decay_(decay) {}
AdagradOptimizer(Tensor *parameter,
LrPolicy *lr,
double epsilon,
double decay)
: ParameterOptimizer(parameter, lr),
accum_gradient_(nullptr),
epsilon_(epsilon),
decay_(decay) {
size_t size = parameter->size();
if (accum_gradient_) delete accum_gradient_;
accum_gradient_ = new Tensor(size);
}
~AdagradOptimizer() {
if (accum_gradient_) delete accum_gradient_;
}
void Update(const Tensor *gradient);
void set_weight(Tensor *p);
private:
Tensor *accum_gradient_;
......
......@@ -28,5 +28,33 @@ void AdamOptimizer::Update(const Tensor *gradient) {
learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
}
}
const char *AdamOptimizer::SerializeState(int *state_len) {
OptimizerState state;
state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
state.set_num_sample_passed(num_sample_passed_);
TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*momentums_, state.mutable_momentums());
TensorToProto(*velocitys_, state.mutable_velocitys());
state.set_beta_1(beta_1_);
state.set_beta_2(beta_2_);
state.set_decay(decay_);
*state_len += CalStateSize(
parameter_, momentums_, velocitys_, beta_1_, beta_2_, epsilon_, decay_);
return state.SerializeAsString().c_str();
}
void AdamOptimizer::DeSerializeState(const std::string &str) {
OptimizerState state;
state.ParseFromString(str);
lr_policy_->set(state.learning_rate());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.momentums(), momentums_);
ProtoToTensor(state.velocitys(), velocitys_);
beta_1_ = state.beta_1();
beta_2_ = state.beta_2();
}
} // namespace optimizer
} // namespace paddle
......@@ -7,9 +7,12 @@ namespace optimizer {
class AdamOptimizer : public ParameterOptimizer {
public:
AdamOptimizer(
double beta_1, double beta_2, double epsilon, double decay, LrPolicy *lr)
: ParameterOptimizer(lr),
AdamOptimizer(Tensor *parameter,
LrPolicy *lr,
double beta_1,
double beta_2,
double epsilon,
double decay)
: ParameterOptimizer(parameter, lr),
momentums_(nullptr),
velocitys_(nullptr),
beta_1_(beta_1),
......
......@@ -10,6 +10,7 @@ class LrPolicy {
public:
virtual ~LrPolicy() {}
virtual double LearningRate(const uint64_t num_sample_passed) = 0;
virtual void set(double current_learning_rate) = 0;
};
// constant learning rate policy
......@@ -19,6 +20,9 @@ public:
double LearningRate(const uint64_t num_sample_passed) {
return learning_rate;
}
void set(double current_learning_rate) {
learning_rate = current_learning_rate;
}
private:
double learning_rate;
......@@ -31,6 +35,9 @@ public:
double LearningRate(const uint64_t num_sample_passed) {
return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
}
void set(double current_learning_rate) {
learning_rate = current_learning_rate;
}
private:
double learning_rate;
......
......@@ -44,13 +44,13 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
const int state_len) {
paddle_optimizer* optimizer = new paddle_optimizer;
std::string config(config_proto, config_proto + config_proto_len);
optimizer->impl = ParameterOptimizer::Create(config);
Tensor* parameter =
new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
optimizer->impl = ParameterOptimizer::Create(config, parameter);
if (state != nullptr) {
std::string s(state, state + state_len);
optimizer->impl->DeSerializeState(s);
}
Tensor* param = new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
optimizer->impl->set_weight(param);
return optimizer;
}
......@@ -77,6 +77,7 @@ int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
}
int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
*state = o->impl->SerializeState();
return strlen(*state);
int state_len = 0;
*state = o->impl->SerializeState(&state_len);
return state_len;
}
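For callers of the C API, the new contract is that SerializeState reports the state length through its int* argument and paddle_optimizer_get_state forwards that length as its return value. A minimal caller-side sketch, assuming only the declarations shown above (the `opt` handle and the copy into a std::string are illustrative):
// Caller-side sketch; `opt` is an already-created paddle_optimizer* handle.
const char* state = nullptr;
int state_len = paddle_optimizer_get_state(opt, &state);
// Copy the bytes out before making further calls on `opt`.
std::string checkpoint(state, state + state_len);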
......@@ -10,8 +10,8 @@
namespace paddle {
namespace optimizer {
ParameterOptimizer *ParameterOptimizer::Create(
const std::string &config_proto) {
ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
Tensor *parameter) {
paddle::OptimizerConfig config;
CHECK(config.ParseFromString(config_proto))
<< "failed to parse optimizer config";
......@@ -29,34 +29,38 @@ ParameterOptimizer *ParameterOptimizer::Create(
};
LrPolicy *lr = select_lr_policy(config);
auto select_optimizer =
[=](const OptimizerConfig &config) -> ParameterOptimizer * {
[=](Tensor *parameter,
const OptimizerConfig &config) -> ParameterOptimizer * {
if (config.optimizer() == OptimizerConfig::SGD) {
return new SGDOptimizer(config.sgd().momentum(),
return new SGDOptimizer(parameter,
lr,
config.sgd().momentum(),
config.sgd().decay(),
config.sgd().nesterov(),
lr);
config.sgd().nesterov());
}
if (config.optimizer() == OptimizerConfig::Adadelta) {
return new AdadeltaOptimizer(config.adadelta().rho(),
return new AdadeltaOptimizer(parameter,
lr,
config.adadelta().rho(),
config.adadelta().epsilon(),
config.adadelta().decay(),
lr);
config.adadelta().decay());
}
if (config.optimizer() == OptimizerConfig::Adagrad) {
return new AdagradOptimizer(
config.adagrad().epsilon(), config.adagrad().decay(), lr);
parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
}
if (config.optimizer() == OptimizerConfig::Adam) {
return new AdamOptimizer(config.adam().beta_1(),
return new AdamOptimizer(parameter,
lr,
config.adam().beta_1(),
config.adam().beta_2(),
config.adam().epsilon(),
config.adam().decay(),
lr);
config.adam().decay());
}
// default
LOG(WARNING)
<< "No optimizer selected; using SGDOptimizer by default";
return new SGDOptimizer(0.0, 0.0, false, lr);
return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
};
return select_optimizer(parameter, config);
}
......@@ -66,7 +70,5 @@ float *ParameterOptimizer::get_weight(int *param_size) const {
return parameter_->get_buffer();
}
void ParameterOptimizer::set_weight(Tensor *p) { parameter_ = p; }
} // namespace optimizer
} // namespace paddle
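With the factory now taking the parameter tensor directly, creating an optimizer is a two-step affair: build a serialized OptimizerConfig, then call Create with the config string and the Tensor. A minimal sketch, assuming the standard protobuf-generated setters for the config fields read above (set_optimizer, mutable_sgd, ...) and an illustrative parameter size:
paddle::OptimizerConfig config;
config.set_optimizer(paddle::OptimizerConfig::SGD);  // assumed generated setter
config.mutable_sgd()->set_momentum(0.9);             // assumed generated setter
config.mutable_sgd()->set_decay(0.0005);
Tensor* parameter = new Tensor(512);  // illustrative size
ParameterOptimizer* opt =
    ParameterOptimizer::Create(config.SerializeAsString(), parameter);
// `opt` now owns `parameter`: the base-class destructor deletes it.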
......@@ -5,32 +5,35 @@
#include <string>
#include "OptimizerConfig.pb.h"
#include "lr_policy.h"
#include "serialization.h"
#include "tensor.h"
// "Not Implemented Yet" macro
#define NIMPL crash(__PRETTY_FUNCTION__, " not implemented yet")
namespace paddle {
namespace optimizer {
const std::string kOptimizerVersion = "1.0";
class ParameterOptimizer {
public:
/**
* @brief update hook for algorithms that need to traverse the parameter more
* than once.
*/
ParameterOptimizer(LrPolicy *lr) : lr_policy_(lr), num_sample_passed_(0) {}
ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
: parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
virtual ~ParameterOptimizer() { delete parameter_; };
static ParameterOptimizer *Create(const std::string &config_proto);
virtual const char *SerializeState();
virtual void DeSerializeState(const std::string &state);
static ParameterOptimizer *Create(const std::string &config_proto,
Tensor *parameter);
virtual void Update(const Tensor *gradient) = 0;
virtual float *get_weight(int *param_size) const;
virtual void set_weight(Tensor *parameter);
virtual const char *SerializeState(int *state_len) = 0;
virtual void DeSerializeState(const std::string &state) = 0;
protected:
Tensor *parameter_;
// learning rate policy
LrPolicy *lr_policy_;
uint64_t num_sample_passed_;
......
......@@ -2,6 +2,7 @@
#include <sstream>
#include <string>
#include <type_traits>
#include "OptimizerConfig.pb.h"
#include "paddle/utils/Logging.h"
#include "tensor.h"
......@@ -9,6 +10,17 @@
namespace paddle {
namespace optimizer {
// Recursion terminator: nothing left to count.
inline unsigned CalStateSize() { return 0; }
// Fundamental-type hyperparameters (double, bool, ...) contribute sizeof(head).
template <typename HEAD, typename... TAIL>
unsigned CalStateSize(const HEAD& head, const TAIL&... tail) {
return sizeof(head) + CalStateSize(tail...);
}
// Tensor pointers contribute element count times the element size
// (Tensor stores float elements in this code base).
template <typename T, typename... TAIL>
unsigned CalStateSize(T* head, const TAIL&... tail) {
return head->size() * sizeof(float) + CalStateSize(tail...);
}
static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
proto->set_size(tensor.size());
......
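For reference, a small usage sketch of the variadic CalStateSize helper; the tensor size and hyperparameter values below are made up for illustration, and the sketch relies only on the Tensor(size) constructor already used in this change:
Tensor parameter(512);   // illustrative size
Tensor momentums(512);
double momentum = 0.9, decay = 0.0005;
bool nesterov = false;
// 2 * 512 * sizeof(float) + 2 * sizeof(double) + sizeof(bool) bytes
unsigned len = CalStateSize(&parameter, &momentums, momentum, decay, nesterov);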
......@@ -7,20 +7,26 @@ namespace optimizer {
class SGDOptimizer : public ParameterOptimizer {
public:
SGDOptimizer(double m, double d, bool n, LrPolicy* lr)
: ParameterOptimizer(lr),
SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
: ParameterOptimizer(parameter, lr),
momentums_(nullptr),
momentum_(m),
decay_(d),
nesterov_(n) {}
virtual ~SGDOptimizer() { delete momentums_; }
nesterov_(n) {
if (momentum_ != 0.0) {
size_t size = parameter->size();
// TODO: fix this with an alignment-aware allocator bound to Tensor
if (momentums_) delete momentums_;
momentums_ = new Tensor(size);
}
}
virtual ~SGDOptimizer() {
if (momentums_) delete momentums_;
}
void Update(const Tensor* gradient);
const char* SerializeState();
const char* SerializeState(int* state_len);
void DeSerializeState(const std::string& state);
void set_weight(Tensor* p);
float* get_weight(int* param_size) const;
private:
Tensor* momentums_;
double momentum_;
......
......@@ -4,15 +4,6 @@
namespace paddle {
namespace optimizer {
void SGDOptimizer::set_weight(Tensor *p) {
parameter_ = p;
if (momentum_ != 0.0) {
size_t size = p->size();
// TODO: fix it with align aware allocator bind to Tensor
momentums_ = new Tensor(size);
}
}
void SGDOptimizer::Update(const Tensor *gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
......@@ -36,28 +27,30 @@ void SGDOptimizer::Update(const Tensor *gradient) {
}
}
const char *SGDOptimizer::SerializeState() {
const char *SGDOptimizer::SerializeState(int *state_len) {
OptimizerState state;
// version is a global const value
state.set_version(kOptimizerVersion);
TensorToProto(*parameter_, state.add_data());
TensorToProto(*momentums_, state.add_data());
state.add_hyperparam(momentum_);
state.set_learning_rate(lr_policy_->LearningRate(num_sample_passed_));
state.set_num_sample_passed(num_sample_passed_);
TensorToProto(*parameter_, state.mutable_parameter());
if (momentums_) TensorToProto(*momentums_, state.mutable_momentums());
state.set_momentum(momentum_);
state.set_decay(decay_);
state.set_nesterov(nesterov_);
*state_len +=
CalStateSize(parameter_, momentums_, momentum_, decay_, nesterov_);
return state.SerializeAsString().c_str();
}
void SGDOptimizer::DeSerializeState(const std::string &str) {
OptimizerState state;
state.ParseFromString(str);
CHECK(state.version() == kOptimizerVersion)
<< "error version of state"
<< "expected : " << kOptimizerVersion << "get : " << state.version();
lr_policy_->set(state.learning_rate());
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.data(0), parameter_);
if (state.data_size() == 2) {
ProtoToTensor(state.data(1), momentums_);
momentum_ = state.hyperparam(0);
}
ProtoToTensor(state.parameter(), parameter_);
if (state.has_momentums()) ProtoToTensor(state.momentums(), momentums_);
momentum_ = state.momentum();
}
} // namespace optimizer
......
......@@ -5,13 +5,14 @@ option optimize_for = LITE_RUNTIME;
package paddle;
message SGDConfig {
// SGD
// momentum: float >= 0. Parameter updates momentum.
// decay: float >= 0. Learning rate decay over each update.
// nesterov: boolean. Whether to apply Nesterov momentum.
optional double momentum = 21 [default = 0.0];
optional double decay = 23 [default = 0.0];
optional bool nesterov = 24 [default = false];
}
......@@ -75,14 +76,38 @@ enum DataType {
}
required DataType data_type = 1;
repeated bytes content = 2;
optional uint64 size = 3;
}
message OptimizerState {
// match old training state with format parser
required string version = 100;
repeated TensorProto data = 1;
repeated double hyperparam = 3;
optional double learning_rate = 101;
optional double lr_decay_a = 102;
optional double lr_decay_b = 103;
optional double num_sample_passed = 104;
optional TensorProto parameter = 105;
// momentum
optional TensorProto momentums = 1;
// adadelta
optional TensorProto accum_gradient = 2;
optional TensorProto accum_delta = 3;
optional TensorProto update_delta = 4;
// adam
optional TensorProto velocitys = 5;
// momentum
optional double momentum = 6;
optional double decay = 7;
optional bool nesterov = 8;
// adadelta
optional double rho = 9;
optional double epsilon = 10;
// adam
optional double beta_1 = 11;
optional double beta_2 = 12;
}
message OptimizerConfig {
......
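Because every optimizer serializes into the same OptimizerState message, a checkpoint reader can probe which optional fields were filled in. A short sketch, assuming the standard protobuf-generated has_*() accessors for the optional fields above and an illustrative `serialized` string:
paddle::OptimizerState state;
state.ParseFromString(serialized);  // `serialized` is illustrative
if (state.has_accum_gradient()) {
  // Adagrad/Adadelta checkpoint: the accumulated gradient was saved.
}
if (state.has_velocitys()) {
  // Adam checkpoint: second-moment estimates were saved.
}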