Commit 26e9c4e2 authored by dzhwinter

"add vector alias to make name clear"

Parent b4aa0eca
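The "vector alias" in the title is the pattern of binding a local reference to each dereferenced Tensor* member, so the update loops can write param[i] and grad[i] instead of repeated (*parameter_)[i] dereferences. A minimal, self-contained sketch of that pattern, with std::vector<float> standing in for the project's Tensor type (names and values here are illustrative, not from the repository):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // Stand-ins for the heap-allocated members an optimizer instance holds.
  std::vector<float>* parameter_ = new std::vector<float>{1.f, 2.f, 3.f};
  std::vector<float>* gradient = new std::vector<float>{0.1f, 0.2f, 0.3f};

  // The alias pattern introduced by this commit: bind short reference names
  // once, then index them inside the loop.
  std::vector<float>& param = *parameter_;
  const std::vector<float>& grad = *gradient;
  for (std::size_t i = 0; i < param.size(); ++i) {
    param[i] -= 0.01f * grad[i];  // plain SGD step, for illustration only
  }

  std::printf("%f %f %f\n", param[0], param[1], param[2]);
  delete parameter_;
  delete gradient;
  return 0;
}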
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 set(OPITMIZER_SRCS
-    # adadelta_optimizer.cc
-    # adagrad_optimizer.cc
-    # adam_optimizer.cc
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
     optimizer.cc
     parameter_optimizer.cc
     sgd_optmizer.cc
-    regularizer.cc
     )
 set(OPITMIZER_Headers
-    # adadelta_optimizer.h
-    # adagrad_optimizer.h
-    # adam_optimizer.h
+    adadelta_optimizer.h
+    adagrad_optimizer.h
+    adam_optimizer.h
     lr_policy.h
     optimizer.h
     parameter_optimizer.h
-    regularizer.h
     sgd_optimizer.h
     Tensor.h
     )
......
#include "adadelta_optimizer.h" #include "adadelta_optimizer.h"
#include <algorithm> #include <algorithm>
#include <cmath>
namespace paddle { namespace paddle {
namespace optimizer { namespace optimizer {
...@@ -7,28 +8,30 @@ namespace optimizer { ...@@ -7,28 +8,30 @@ namespace optimizer {
void AdadeltaOptimizer::set_weight(Tensor* p) { void AdadeltaOptimizer::set_weight(Tensor* p) {
size_t size = p->size(); size_t size = p->size();
real* gptr = new real[size]; real* gptr = new real[size];
accum_gradient = Tensor(gptr, size); accum_gradient = new Tensor(gptr, size);
real* dptr = new real[size]; real* dptr = new real[size];
accum_delta = Tensor(dptr, size); accum_delta = new Tensor(dptr, size);
real* dptr_current = new real[size]; real* dptr_current = new real[size];
update_delta = Tensor(dptr_current, size); update_delta = new Tensor(dptr_current, size);
} }
void AdadeltaOptimizer::update(const Tensor& gradient) { void AdadeltaOptimizer::update(const Tensor* gradient) {
num_sample_passed += 1; num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed); double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
for (size_t i = 0; i < parameter_->size(); ++i) { Tensor& param = *parameter_;
accum_gradient[i] = const Tensor& grad = *gradient;
rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i]; Tensor& accum_g = *accum_gradient;
Tensor& accum_d = *accum_delta;
Tensor& update_d = *update_delta;
for (size_t i = 0; i < param.size(); ++i) {
accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
update_delta[i] = std::sqrt(accum_delta[i] + epsilon) / update_d[i] = std::sqrt(accum_d[i] + epsilon) /
std::sqrt(accum_gradient[i] + epsilon) * gradient[i]; std::sqrt(accum_g[i] + epsilon) * grad[i];
accum_delta[i] = accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
rho * accum_delta[i] + (1.0 - rho) * update_delta[i] * update_delta[i];
parameter_[i] -= param[i] -= learning_rate * update_d[i] + learning_rate * decay * param[i];
learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
} }
} }
} // namespace optimizer } // namespace optimizer
......
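For reference, the loop above is a per-element Adadelta step. The following is a hedged, self-contained sketch of the same arithmetic, with std::vector<double> standing in for the project's Tensor and a made-up function name (adadelta_step) and hyperparameter values:

#include <cmath>
#include <cstdio>
#include <vector>

// Mirrors the per-element arithmetic of AdadeltaOptimizer::update above.
void adadelta_step(std::vector<double>& param, const std::vector<double>& grad,
                   std::vector<double>& accum_g, std::vector<double>& accum_d,
                   std::vector<double>& update_d, double lr, double rho,
                   double epsilon, double decay) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
    update_d[i] = std::sqrt(accum_d[i] + epsilon) /
                  std::sqrt(accum_g[i] + epsilon) * grad[i];
    accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
    param[i] -= lr * update_d[i] + lr * decay * param[i];
  }
}

int main() {
  std::vector<double> p{1.0, -2.0}, g{0.3, -0.1};
  std::vector<double> ag(2, 0.0), ad(2, 0.0), ud(2, 0.0);
  adadelta_step(p, g, ag, ad, ud, /*lr=*/1.0, /*rho=*/0.95, /*epsilon=*/1e-6,
                /*decay=*/0.0);
  std::printf("%f %f\n", p[0], p[1]);
  return 0;
}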
@@ -19,7 +19,7 @@ public:
     if (accum_delta) delete accum_delta;
     if (update_delta) delete update_delta;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
......
+#include <cmath>
 #include "adagrad_optimizer.h"
 namespace paddle {
 namespace optimizer {
 void AdagradOptimizer::set_weight(Tensor* p) {
-  size_t size = p->width();
+  size_t size = p->size();
   real* gptr = new real[size];
-  accum_gradient = Tensor(gptr, size);
-  real* dptr = new real[size];
-  accum_delta = Tensor(dtpr, size);
-  real* dptr_current = new real[size];
-  update_delta = Tensor(dptr_current, size);
+  accum_gradient = new Tensor(gptr, size);
 }
-void AdagradOptimizer::update(const Tensor& gradient) {
+void AdagradOptimizer::update(const Tensor* gradient) {
   num_sample_passed += 1;
-  double learning_rate = lr_policy->get_learning_rate();
-  for (size_t i = 0; i < parameter_.size(); ++i) {
-    accum_gradient[i] += gradient[i] * gradient[i];
-    parameter_[i] +=
-        learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) +
-                         decay * parameter_[i]);
+  double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+  Tensor& param = *parameter_;
+  const Tensor& grad = *gradient;
+  Tensor& accum_g = *accum_gradient;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] += grad[i] * grad[i];
+    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon) +
+                learning_rate * decay * param[i];
   }
 }
......
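Besides the pointer change, this hunk also drops the accumulated-delta buffers that only Adadelta needs. A hedged sketch of the surviving per-element rule, again with std::vector<double> in place of Tensor and a made-up function name (it mirrors the new loop exactly, including its += update):

#include <cmath>
#include <cstddef>
#include <vector>

// Mirrors the per-element arithmetic of AdagradOptimizer::update above:
// accumulate squared gradients and scale the step by their running root.
void adagrad_step(std::vector<double>& param, const std::vector<double>& grad,
                  std::vector<double>& accum_g, double lr, double epsilon,
                  double decay) {
  for (std::size_t i = 0; i < param.size(); ++i) {
    accum_g[i] += grad[i] * grad[i];
    param[i] += lr * grad[i] / std::sqrt(accum_g[i] + epsilon) +
                lr * decay * param[i];
  }
}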
@@ -13,7 +13,7 @@ public:
   ~AdagradOptimizer() {
     if (accum_gradient) delete accum_gradient;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
......
#include "adam_optimizer.h" #include "adam_optimizer.h"
#include <cmath>
namespace paddle { namespace paddle {
namespace optimizer { namespace optimizer {
void AdamOptimizer::set_weight(Tensor *p) { void AdamOptimizer::set_weight(Tensor *p) {
size_t size = p->width(); size_t size = p->size();
real *mptr = new real[size]; real *mptr = new real[size];
momentums_ = Tensor(mptr, size); momentums_ = new Tensor(mptr, size);
real *vptr = new real[size]; real *vptr = new real[size];
velocitys_ = Tensor(vtpr, size); velocitys_ = new Tensor(vptr, size);
} }
void AdamOptimizer::update(const Tensor &gradient) { void AdamOptimizer::update(const Tensor *gradient) {
num_sample_passed += 1; num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed); double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
double coef1 = 1.0 - std::pow(beta_1, num_sample_passed); double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
double coef2 = 1.0 - std::pow(beta_2, num_sample_passed); double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
learning_rate *= std::sqrt(coef2) / coef1; learning_rate *= std::sqrt(coef2) / coef1;
for (size_t i = 0; i < parameter_->size(); ++i) { Tensor &param = *parameter_;
momentums_[i] = beta_1 * momentums_[i] + (1.0 - beta_1) * gradient[i]; const Tensor &grad = *gradient;
velocitys_[i] = Tensor &m = *momentums_;
beta_2 * velocitys_[i] + (1.0 - beta_2) * gradient[i] * gradient[i]; Tensor &v = *velocitys_;
parameter_[i] -= for (size_t i = 0; i < param.size(); ++i) {
learning_rate * (momentums_[i] / std::sqrt(velocitys_[i] + epsilon) + m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
decay * parameter_[i]); v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
param[i] -=
learning_rate * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
} }
} }
} // namespace optimizer } // namespace optimizer
......
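The Adam loop above folds the bias correction into the effective learning rate before the element-wise pass. A hedged standalone sketch of the same computation, with std::vector<double> in place of Tensor, a made-up function name, and t standing for the sample count used in the correction terms:

#include <cmath>
#include <cstddef>
#include <vector>

// Mirrors AdamOptimizer::update above: first/second moment estimates with the
// bias correction folded into the effective learning rate.
void adam_step(std::vector<double>& param, const std::vector<double>& grad,
               std::vector<double>& m, std::vector<double>& v, double lr,
               double beta_1, double beta_2, double epsilon, double decay,
               double t) {
  double coef1 = 1.0 - std::pow(beta_1, t);
  double coef2 = 1.0 - std::pow(beta_2, t);
  lr *= std::sqrt(coef2) / coef1;
  for (std::size_t i = 0; i < param.size(); ++i) {
    m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
    v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
    param[i] -= lr * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
  }
}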
@@ -19,7 +19,7 @@ public:
     if (momentums_) delete momentums_;
     if (velocitys_) delete velocitys_;
   }
-  void update(const Tensor &gradient);
+  void update(const Tensor *gradient);
   void set_weight(Tensor *p);
   real *get_weight() const;
......
@@ -24,7 +24,7 @@ public:
   virtual ~ParameterOptimizer() { delete parameter_; };
   static ParameterOptimizer *create(const ::std::string &config_proto);
-  virtual void update(const Tensor &gradient) = 0;
+  virtual void update(const Tensor *gradient) = 0;
   virtual real *get_weight() const;
   virtual void set_weight(Tensor *parameter);
......
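This is the interface change the concrete optimizers follow: the pure virtual update() now takes const Tensor* instead of const Tensor&. A self-contained sketch with mock types (not the project's real classes) illustrating why every subclass must switch as well; a subclass that kept the old reference parameter would declare an unrelated overload, no longer override the pure virtual, and stay abstract:

#include <cstddef>
#include <vector>

using Tensor = std::vector<float>;  // stand-in for the project's Tensor

struct ParameterOptimizerLike {
  virtual ~ParameterOptimizerLike() = default;
  virtual void update(const Tensor* gradient) = 0;  // new pointer signature
};

struct SgdLike : ParameterOptimizerLike {
  explicit SgdLike(Tensor* p) : param_(p) {}
  // Must match the new signature exactly; "override" makes the compiler
  // reject a stale update(const Tensor&) declaration.
  void update(const Tensor* gradient) override {
    Tensor& param = *param_;
    const Tensor& grad = *gradient;
    for (std::size_t i = 0; i < param.size(); ++i) param[i] -= 0.01f * grad[i];
  }
  Tensor* param_;
};

int main() {
  Tensor p{1.f, 2.f};
  Tensor g{0.5f, 0.5f};
  SgdLike opt(&p);
  opt.update(&g);  // gradient handed over by pointer, as in the new interface
  return 0;
}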
@@ -15,7 +15,7 @@ public:
   SGDOptimizer(double m, double d, bool n, BaseLr* lr)
       : ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
   virtual ~SGDOptimizer() { delete momentums_; }
-  void update(const Tensor& gradient);
+  void update(const Tensor* gradient);
   void set_weight(Tensor* p);
   real* get_weight() const;
......
@@ -13,24 +13,25 @@ void SGDOptimizer::set_weight(Tensor *p) {
   }
 }
-void SGDOptimizer::update(const Tensor &gradient) {
+void SGDOptimizer::update(const Tensor *gradient) {
   num_sample_passed += 1;
   double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
   real velocity = 0.0;
   Tensor &param = *parameter_;
-  for (size_t i = 0; i < parameter_->size(); ++i) {
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  for (size_t i = 0; i < param.size(); ++i) {
     if (momentum == 0.0) {
-      velocity =
-          -learning_rate * gradient[i] - learning_rate * decay * parameter_[i];
+      velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
     } else {
-      momentums_[i] = momentum * momentums_[i] - learning_rate * gradient[i] -
-                      learning_rate * decay * parameter_[i];
-      velocity = momentums_[i];
+      m[i] = momentum * m[i] - learning_rate * grad[i] -
+             learning_rate * decay * param[i];
+      velocity = m[i];
     }
     if (nesterov) {
-      parameter_[i] += momentum * velocity - learning_rate * gradient[i];
+      param[i] += momentum * velocity - learning_rate * grad[i];
     } else {
-      parameter_[i] += velocity;
+      param[i] += velocity;
     }
   }
 }
......
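For reference, the SGD loop above in isolation, covering both the plain-momentum and Nesterov branches. A hedged sketch with std::vector<double> in place of Tensor and a made-up function name:

#include <cstddef>
#include <vector>

// Mirrors SGDOptimizer::update above: momentum buffer m, optional Nesterov
// lookahead, and weight decay folded into the velocity term.
void sgd_step(std::vector<double>& param, const std::vector<double>& grad,
              std::vector<double>& m, double lr, double momentum, double decay,
              bool nesterov) {
  double velocity = 0.0;
  for (std::size_t i = 0; i < param.size(); ++i) {
    if (momentum == 0.0) {
      velocity = -lr * grad[i] - lr * decay * param[i];
    } else {
      m[i] = momentum * m[i] - lr * grad[i] - lr * decay * param[i];
      velocity = m[i];
    }
    if (nesterov) {
      param[i] += momentum * velocity - lr * grad[i];
    } else {
      param[i] += velocity;
    }
  }
}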