提交 471573ef 编写于 作者: X xzl

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into poolmaxpool_with_mask

...@@ -54,7 +54,6 @@ void MKLDNNAddtoLayer::reshape( ...@@ -54,7 +54,6 @@ void MKLDNNAddtoLayer::reshape(
ow = iw; ow = iw;
reshapeOutput(oh, ow); reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow); resizeOutput(bs, oc * oh * ow);
printSizeInfo();
} }
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
......
...@@ -125,7 +125,6 @@ void MKLDNNBatchNormLayer::reshape( ...@@ -125,7 +125,6 @@ void MKLDNNBatchNormLayer::reshape(
<< "Input channel can not be changed"; << "Input channel can not be changed";
reshapeOutput(oh, ow); reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow); resizeOutput(bs, oc * oh * ow);
printSizeInfo();
} }
void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
......
...@@ -102,8 +102,6 @@ void MKLDNNConvLayer::reshape( ...@@ -102,8 +102,6 @@ void MKLDNNConvLayer::reshape(
reshapeOutput(oh, ow); reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow); resizeOutput(bs, oc * oh * ow);
printSizeInfo();
} }
void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
......
...@@ -92,7 +92,7 @@ public: ...@@ -92,7 +92,7 @@ public:
void printSizeInfo() override { void printSizeInfo() override {
MKLDNNLayer::printSizeInfo(); MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_ VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
<< ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_ << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
<< ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_; << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
} }
......
...@@ -84,8 +84,6 @@ void MKLDNNFcLayer::reshape( ...@@ -84,8 +84,6 @@ void MKLDNNFcLayer::reshape(
reshapeOutput(oh, ow); reshapeOutput(oh, ow);
resizeOutput(bs, oc); resizeOutput(bs, oc);
printSizeInfo();
} }
void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
......
...@@ -71,8 +71,6 @@ void MKLDNNPoolLayer::reshape( ...@@ -71,8 +71,6 @@ void MKLDNNPoolLayer::reshape(
reshapeOutput(oh, ow); reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow); resizeOutput(bs, oc * oh * ow);
printSizeInfo();
} }
void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
......
...@@ -27,20 +27,22 @@ namespace platform { ...@@ -27,20 +27,22 @@ namespace platform {
This wrap is a hack to avoid this bug. This wrap is a hack to avoid this bug.
*/ */
template <class Callable, class... Args> template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = false; bool good = false;
std::exception ex; std::exception ex;
std::call_once(flag, [&]() { std::call_once(flag,
try { [&](Args&&... args) {
f(args...); try {
good = true; f(args...);
} catch (const std::exception& e) { good = true;
ex = e; } catch (const std::exception& e) {
} catch (...) { ex = e;
ex = std::runtime_error("excption caught in call_once"); } catch (...) {
} ex = std::runtime_error("excption caught in call_once");
}); }
},
args...);
if (!good) { if (!good) {
throw std::exception(ex); throw std::exception(ex);
} }
......
...@@ -4,7 +4,7 @@ import itertools ...@@ -4,7 +4,7 @@ import itertools
from paddle.v2.framework.framework import Variable, g_main_program, \ from paddle.v2.framework.framework import Variable, g_main_program, \
g_startup_program, unique_name, Program g_startup_program, unique_name, Program
from paddle.v2.framework.initializer import ConstantInitializer, \ from paddle.v2.framework.initializer import ConstantInitializer, \
UniformInitializer UniformInitializer, XavierInitializer
class LayerHelper(object): class LayerHelper(object):
...@@ -61,7 +61,7 @@ class LayerHelper(object): ...@@ -61,7 +61,7 @@ class LayerHelper(object):
@property @property
def param_attr(self): def param_attr(self):
default = {'name': None, 'initializer': UniformInitializer()} default = {'name': None, 'initializer': XavierInitializer()}
actual = self.kwargs.get('param_attr', None) actual = self.kwargs.get('param_attr', None)
if actual is None: if actual is None:
actual = default actual = default
...@@ -70,10 +70,11 @@ class LayerHelper(object): ...@@ -70,10 +70,11 @@ class LayerHelper(object):
actual[default_field] = default[default_field] actual[default_field] = default[default_field]
return actual return actual
@property
def bias_attr(self): def bias_attr(self):
default = {'name': None, 'initializer': ConstantInitializer()} default = {'name': None, 'initializer': XavierInitializer()}
bias_attr = self.kwargs.get('bias_attr', None) bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is True: if bias_attr is None:
bias_attr = default bias_attr = default
if isinstance(bias_attr, dict): if isinstance(bias_attr, dict):
...@@ -166,7 +167,7 @@ class LayerHelper(object): ...@@ -166,7 +167,7 @@ class LayerHelper(object):
num_flatten_dims = 1 num_flatten_dims = 1
size = list(input_var.shape[num_flatten_dims:]) size = list(input_var.shape[num_flatten_dims:])
bias_attr = self.bias_attr() bias_attr = self.bias_attr
if not bias_attr: if not bias_attr:
return input_var return input_var
......
...@@ -16,7 +16,7 @@ __all__ = [ ...@@ -16,7 +16,7 @@ __all__ = [
def fc(input, def fc(input,
size, size,
param_attr=None, param_attr=None,
bias_attr=True, bias_attr=None,
name=None, name=None,
act=None, act=None,
num_flatten_dims=1, num_flatten_dims=1,
...@@ -125,6 +125,55 @@ def embedding(input, ...@@ -125,6 +125,55 @@ def embedding(input,
return tmp return tmp
# TODO(qijun): expose H0 and C0
def dynamic_lstm(input,
size,
data_type='float32',
param_attr=None,
bias_attr=None,
use_peepholes=True,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
main_program=None,
startup_program=None):
helper = LayerHelper('lstm', **locals())
size = size / 4
weight = helper.create_parameter(
attr=helper.param_attr, shape=[size, 4 * size], dtype=data_type)
bias_size = [1, 7 * size]
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=data_type, suffix='b')
hidden = helper.create_tmp_variable(data_type)
cell = helper.create_tmp_variable(data_type)
batch_gate = helper.create_tmp_variable(data_type)
batch_cell_pre_act = helper.create_tmp_variable(data_type)
helper.append_op(
type='lstm',
inputs={'Input': input,
'Weight': weight,
'Bias': bias},
outputs={
'Hidden': hidden,
'Cell': cell,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
'candidate_activation': candidate_activation
})
return hidden, cell
def data(name, def data(name,
shape, shape,
data_type='float32', data_type='float32',
......
...@@ -35,15 +35,21 @@ class Optimizer(object): ...@@ -35,15 +35,21 @@ class Optimizer(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def _initialize_tensors(self, block): def _create_param_lr(self, param_and_grad):
"""Create all necessary tensors, that will be shared for all parameter updates. # create learning rate variable for every parameter
param = param_and_grad[0]
Tensors like learning rate should be initialized here. param_lr = param.optimize_attr['learning_rate']
param_lr_shape = [1]
Args: param_lr_var = self.helper.create_global_variable(
block: the block in which the loss variable is present name=unique_name("learning_rate"),
""" dtype='float32',
pass shape=param_lr_shape,
lod_level=1,
persistable=True)
param_lr = param_lr * self._learning_rate
self.helper.set_variable_initializer(
var=param_lr_var, initializer=ConstantInitializer(param_lr))
return param_lr_var
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters """Create all accumulators needed by the parameters
...@@ -161,8 +167,6 @@ class Optimizer(object): ...@@ -161,8 +167,6 @@ class Optimizer(object):
startup_program=startup_program) startup_program=startup_program)
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
# Create any necessary tensors
self._initialize_tensors(loss.block)
optimize_ops = [] optimize_ops = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
...@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer): ...@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer):
self.type = "sgd" self.type = "sgd"
self._learning_rate = learning_rate self._learning_rate = learning_rate
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
# create the optimize op # create the optimize op
sgd_op = block.append_op( sgd_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={"ParamOut": param_and_grad[0]}) outputs={"ParamOut": param_and_grad[0]})
...@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer): ...@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer):
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
def _initialize_tensors(self, block):
assert isinstance(block, framework.Block)
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer): ...@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer):
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Velocity": velocity_acc, "Velocity": velocity_acc,
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={ outputs={
"ParamOut": param_and_grad[0], "ParamOut": param_and_grad[0],
...@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer): ...@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer):
self._learning_rate = learning_rate self._learning_rate = learning_rate
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer): ...@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer):
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"Moment": moment_acc, "Moment": moment_acc,
"LearningRate": self._lr "LearningRate": self._create_param_lr(param_and_grad)
}, },
outputs={"ParamOut": param_and_grad[0], outputs={"ParamOut": param_and_grad[0],
"MomentOut": moment_acc}, "MomentOut": moment_acc},
...@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer): ...@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer):
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer): ...@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer):
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr, "LearningRate": self._create_param_lr(param_and_grad),
"Moment1": moment1, "Moment1": moment1,
"Moment2": moment2, "Moment2": moment2,
"Beta1Pow": self._beta1_pow_acc, "Beta1Pow": self._beta1_pow_acc,
...@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer): ...@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer):
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
def _initialize_tensors(self, block):
lr_shape = [1]
# create a variable for learning_rate
self._lr = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=lr_shape,
lod_level=1,
persistable=True)
self.helper.set_variable_initializer(
var=self._lr, initializer=ConstantInitializer(self._learning_rate))
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
# Create beta1 power accumulator tensor # Create beta1 power accumulator tensor
beta_shape = [1] beta_shape = [1]
...@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer): ...@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer):
inputs={ inputs={
"Param": param_and_grad[0], "Param": param_and_grad[0],
"Grad": param_and_grad[1], "Grad": param_and_grad[1],
"LearningRate": self._lr, "LearningRate": self._create_param_lr(param_and_grad),
"Moment": moment, "Moment": moment,
"InfNorm": inf_norm, "InfNorm": inf_norm,
"Beta1Pow": self._beta1_pow_acc "Beta1Pow": self._beta1_pow_acc
......
import paddle.v2 as paddle
import paddle.v2.framework.layers as layers
import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer
from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
from paddle.v2.framework.executor import Executor
import numpy as np
def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3):
assert stacked_num % 2 == 1
data = layers.data(name="words", shape=[1], data_type="int64")
label = layers.data(name="label", shape=[1], data_type="int64")
emb = layers.embedding(input=data, size=[input_dim, emb_dim])
# add bias attr
# TODO(qijun) linear act
fc1 = layers.fc(input=emb, size=hid_dim)
lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = layers.fc(input=inputs, size=hid_dim)
lstm, cell = layers.dynamic_lstm(
input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
inputs = [fc, lstm]
fc_last = layers.sequence_pool(input=inputs[0], pool_type='max')
lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max')
prediction = layers.fc(input=[fc_last, lstm_last],
size=class_dim,
act='softmax')
cost = layers.cross_entropy(input=prediction, label=label)
avg_cost = layers.mean(x=cost)
adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
opts = adam_optimizer.minimize(avg_cost)
acc = layers.accuracy(input=prediction, label=label)
return avg_cost, acc
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict()
print "load word dict successfully"
dict_dim = len(word_dict)
class_dim = 2
cost, acc = stacked_lstm_net(input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE)
place = core.CPUPlace()
exe = Executor(place)
exe.run(g_startup_program)
for pass_id in xrange(PASS_NUM):
for data in train_data():
tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
label = np.array(map(lambda x: x[1], data)).astype("int64")
label = label.reshape([BATCH_SIZE, 1])
tensor_label = core.LoDTensor()
tensor_label.set(label, place)
outs = exe.run(g_main_program,
feed={"words": tensor_words,
"label": tensor_label},
fetch_list=[cost, acc])
cost_val = np.array(outs[0])
acc_val = np.array(outs[1])
print("cost=" + str(cost_val) + " acc=" + str(acc_val))
if cost_val < 1.0 and acc_val > 0.7:
exit(0)
exit(1)
if __name__ == '__main__':
main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册