Commit 26e9c4e2 authored by dzhwinter

"add vector alias to make name clear"

Parent b4aa0eca
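The change below switches update() to take const Tensor* and dereferences the Tensor* members (parameter_, accum_gradient, momentums_, ...) into named local references such as Tensor& param = *parameter_; indexing the reference goes through Tensor::operator[], whereas indexing the raw pointer (parameter_[i]) is pointer arithmetic over whole Tensor objects. A minimal, self-contained sketch of that aliasing pattern follows; the Tensor stand-in is hypothetical (the real class is declared in Tensor.h) and only mirrors the size()/operator[] interface the loops rely on.

// Sketch only: a stand-in Tensor with just the interface the optimizer loops
// use (operator[] and size()). The real Tensor lives in Tensor.h.
#include <cstddef>
#include <iostream>

typedef float real;

class Tensor {
public:
  Tensor(real* data, size_t size) : data_(data), size_(size) {}
  real& operator[](size_t i) { return data_[i]; }
  real operator[](size_t i) const { return data_[i]; }
  size_t size() const { return size_; }

private:
  real* data_;
  size_t size_;
};

// parameter_ is a Tensor*, as in the optimizers after this commit.
// Binding a named reference once (the "vector alias") lets the loop read as
// param[i] instead of (*parameter_)[i], and avoids the pointer-indexing
// mistake parameter_[i], which offsets the pointer by whole Tensors.
void scale(Tensor* parameter_, real learning_rate) {
  Tensor& param = *parameter_;
  for (size_t i = 0; i < param.size(); ++i) {
    param[i] *= learning_rate;
  }
}

int main() {
  real* buf = new real[3];
  buf[0] = 1.0f; buf[1] = 2.0f; buf[2] = 3.0f;
  Tensor* parameter_ = new Tensor(buf, 3);
  scale(parameter_, 0.5f);
  std::cout << (*parameter_)[0] << " " << (*parameter_)[2] << std::endl;  // prints 0.5 1.5
  delete parameter_;
  delete[] buf;
  return 0;
}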
include_directories(${CMAKE_CURRENT_BINARY_DIR})
set(OPITMIZER_SRCS
-# adadelta_optimizer.cc
-# adagrad_optimizer.cc
-# adam_optimizer.cc
+adadelta_optimizer.cc
+adagrad_optimizer.cc
+adam_optimizer.cc
optimizer.cc
parameter_optimizer.cc
sgd_optmizer.cc
regularizer.cc
)
set(OPITMIZER_Headers
-# adadelta_optimizer.h
-# adagrad_optimizer.h
-# adam_optimizer.h
+adadelta_optimizer.h
+adagrad_optimizer.h
+adam_optimizer.h
lr_policy.h
optimizer.h
parameter_optimizer.h
regularizer.h
sgd_optimizer.h
Tensor.h
)
......
#include "adadelta_optimizer.h"
#include <algorithm>
#include <cmath>
namespace paddle {
namespace optimizer {
......
@@ -7,28 +8,30 @@ namespace optimizer {
void AdadeltaOptimizer::set_weight(Tensor* p) {
size_t size = p->size();
real* gptr = new real[size];
-accum_gradient = Tensor(gptr, size);
+accum_gradient = new Tensor(gptr, size);
real* dptr = new real[size];
-accum_delta = Tensor(dptr, size);
+accum_delta = new Tensor(dptr, size);
real* dptr_current = new real[size];
-update_delta = Tensor(dptr_current, size);
+update_delta = new Tensor(dptr_current, size);
}
-void AdadeltaOptimizer::update(const Tensor& gradient) {
+void AdadeltaOptimizer::update(const Tensor* gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
-for (size_t i = 0; i < parameter_->size(); ++i) {
-accum_gradient[i] =
-rho * accum_gradient[i] + (1.0 - rho) * gradient[i] * gradient[i];
+Tensor& param = *parameter_;
+const Tensor& grad = *gradient;
+Tensor& accum_g = *accum_gradient;
+Tensor& accum_d = *accum_delta;
+Tensor& update_d = *update_delta;
+for (size_t i = 0; i < param.size(); ++i) {
+accum_g[i] = rho * accum_g[i] + (1.0 - rho) * grad[i] * grad[i];
-update_delta[i] = std::sqrt(accum_delta[i] + epsilon) /
-std::sqrt(accum_gradient[i] + epsilon) * gradient[i];
+update_d[i] = std::sqrt(accum_d[i] + epsilon) /
+std::sqrt(accum_g[i] + epsilon) * grad[i];
-accum_delta[i] =
-rho * accum_delta[i] + (1.0 - rho) * update_delta[i] * update_delta[i];
+accum_d[i] = rho * accum_d[i] + (1.0 - rho) * update_d[i] * update_d[i];
-parameter_[i] -=
-learning_rate * update_delta[i] + learning_rate * decay * parameter_[i];
+param[i] -= learning_rate * update_d[i] + learning_rate * decay * param[i];
}
}
} // namespace optimizer
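For reference, the rewritten loop above implements the AdaDelta rule plus an L2 decay term; with g_t the gradient, \eta = learning_rate and \lambda = decay, accum_g tracks E[g^2], accum_d tracks E[\Delta\theta^2] and update_d holds \Delta\theta:

E[g^2]_t = \rho E[g^2]_{t-1} + (1 - \rho) g_t^2
\Delta\theta_t = \frac{\sqrt{E[\Delta\theta^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}} \, g_t
E[\Delta\theta^2]_t = \rho E[\Delta\theta^2]_{t-1} + (1 - \rho) \Delta\theta_t^2
\theta_{t+1} = \theta_t - \eta \Delta\theta_t - \eta \lambda \theta_t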
......
......
@@ -19,7 +19,7 @@ public:
if (accum_delta) delete accum_delta;
if (update_delta) delete update_delta;
}
-void update(const Tensor &gradient);
+void update(const Tensor *gradient);
void set_weight(Tensor *p);
real *get_weight() const;
......
#include <cmath>
#include "adagrad_optimizer.h"
namespace paddle {
namespace optimizer {
void AdagradOptimizer::set_weight(Tensor* p) {
-size_t size = p->width();
+size_t size = p->size();
real* gptr = new real[size];
-accum_gradient = Tensor(gptr, size);
-real* dptr = new real[size];
-accum_delta = Tensor(dtpr, size);
-real* dptr_current = new real[size];
-update_delta = Tensor(dptr_current, size);
+accum_gradient = new Tensor(gptr, size);
}
-void AdagradOptimizer::update(const Tensor& gradient) {
+void AdagradOptimizer::update(const Tensor* gradient) {
num_sample_passed += 1;
-double learning_rate = lr_policy->get_learning_rate();
-for (size_t i = 0; i < parameter_.size(); ++i) {
-accum_gradient[i] += gradient[i] * gradient[i];
-parameter_[i] +=
-learning_rate * (gradient[i] / std::sqrt(accum_gradient[i] + epsilon) +
-decay * parameter_[i]);
+double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
+Tensor& param = *parameter_;
+const Tensor& grad = *gradient;
+Tensor& accum_g = *accum_gradient;
+for (size_t i = 0; i < param.size(); ++i) {
+accum_g[i] += grad[i] * grad[i];
+param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon) +
+learning_rate * decay * param[i];
}
}
......
......
@@ -13,7 +13,7 @@ public:
~AdagradOptimizer() {
if (accum_gradient) delete accum_gradient;
}
-void update(const Tensor &gradient);
+void update(const Tensor *gradient);
void set_weight(Tensor *p);
real *get_weight() const;
......
#include "adam_optimizer.h"
#include <cmath>
namespace paddle {
namespace optimizer {
void AdamOptimizer::set_weight(Tensor *p) {
-size_t size = p->width();
+size_t size = p->size();
real *mptr = new real[size];
-momentums_ = Tensor(mptr, size);
+momentums_ = new Tensor(mptr, size);
real *vptr = new real[size];
-velocitys_ = Tensor(vtpr, size);
+velocitys_ = new Tensor(vptr, size);
}
-void AdamOptimizer::update(const Tensor &gradient) {
+void AdamOptimizer::update(const Tensor *gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
double coef1 = 1.0 - std::pow(beta_1, num_sample_passed);
double coef2 = 1.0 - std::pow(beta_2, num_sample_passed);
learning_rate *= std::sqrt(coef2) / coef1;
-for (size_t i = 0; i < parameter_->size(); ++i) {
-momentums_[i] = beta_1 * momentums_[i] + (1.0 - beta_1) * gradient[i];
-velocitys_[i] =
-beta_2 * velocitys_[i] + (1.0 - beta_2) * gradient[i] * gradient[i];
-parameter_[i] -=
-learning_rate * (momentums_[i] / std::sqrt(velocitys_[i] + epsilon) +
-decay * parameter_[i]);
+Tensor &param = *parameter_;
+const Tensor &grad = *gradient;
+Tensor &m = *momentums_;
+Tensor &v = *velocitys_;
+for (size_t i = 0; i < param.size(); ++i) {
+m[i] = beta_1 * m[i] + (1.0 - beta_1) * grad[i];
+v[i] = beta_2 * v[i] + (1.0 - beta_2) * grad[i] * grad[i];
+param[i] -=
+learning_rate * (m[i] / std::sqrt(v[i] + epsilon) + decay * param[i]);
}
}
} // namespace optimizer
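For reference, the rewritten loop above implements Adam with the bias correction folded into the step size plus an L2 decay term; with t = num_sample_passed, \eta = learning_rate and \lambda = decay:

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\hat{\eta}_t = \eta \, \frac{\sqrt{1 - \beta_2^{\,t}}}{1 - \beta_1^{\,t}}
\theta_{t+1} = \theta_t - \hat{\eta}_t \left( \frac{m_t}{\sqrt{v_t + \epsilon}} + \lambda \theta_t \right)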
......
......
@@ -19,7 +19,7 @@ public:
if (momentums_) delete momentums_;
if (velocitys_) delete velocitys_;
}
-void update(const Tensor &gradient);
+void update(const Tensor *gradient);
void set_weight(Tensor *p);
real *get_weight() const;
......
......
@@ -24,7 +24,7 @@ public:
virtual ~ParameterOptimizer() { delete parameter_; };
static ParameterOptimizer *create(const ::std::string &config_proto);
-virtual void update(const Tensor &gradient) = 0;
+virtual void update(const Tensor *gradient) = 0;
virtual real *get_weight() const;
virtual void set_weight(Tensor *parameter);
......
......
@@ -15,7 +15,7 @@ public:
SGDOptimizer(double m, double d, bool n, BaseLr* lr)
: ParameterOptimizer(lr), momentum(m), decay(d), nesterov(n) {}
virtual ~SGDOptimizer() { delete momentums_; }
-void update(const Tensor& gradient);
+void update(const Tensor* gradient);
void set_weight(Tensor* p);
real* get_weight() const;
......
......
@@ -13,24 +13,25 @@ void SGDOptimizer::set_weight(Tensor *p) {
}
}
-void SGDOptimizer::update(const Tensor &gradient) {
+void SGDOptimizer::update(const Tensor *gradient) {
num_sample_passed += 1;
double learning_rate = lr_policy->get_learning_rate(num_sample_passed);
real velocity = 0.0;
-for (size_t i = 0; i < parameter_->size(); ++i) {
+Tensor &param = *parameter_;
+const Tensor &grad = *gradient;
+Tensor &m = *momentums_;
+for (size_t i = 0; i < param.size(); ++i) {
if (momentum == 0.0) {
-velocity = -learning_rate * gradient[i] - learning_rate * decay * parameter_[i];
+velocity = -learning_rate * grad[i] - learning_rate * decay * param[i];
} else {
-momentums_[i] = momentum * momentums_[i] - learning_rate * gradient[i] -
-learning_rate * decay * parameter_[i];
-velocity = momentums_[i];
+m[i] = momentum * m[i] - learning_rate * grad[i] -
+learning_rate * decay * param[i];
+velocity = m[i];
}
if (nesterov) {
-parameter_[i] += momentum * velocity - learning_rate * gradient[i];
+param[i] += momentum * velocity - learning_rate * grad[i];
} else {
-parameter_[i] += velocity;
+param[i] += velocity;
}
}
}
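For reference, the branches above implement SGD with momentum, Nesterov lookahead and L2 decay; with \mu = momentum, \eta = learning_rate and \lambda = decay:

v_t = \mu v_{t-1} - \eta g_t - \eta \lambda \theta_t \quad (v_{t-1} \equiv 0 \text{ when } \mu = 0)
\theta_{t+1} = \theta_t + v_t \quad \text{(plain / momentum)}
\theta_{t+1} = \theta_t + \mu v_t - \eta g_t \quad \text{(nesterov)}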
......