Unverified commit 1b6dcc2f authored by Yu Yang, committed by GitHub

Feature/param attr (#5996)

* Make param_attr a strongly typed class

Fix #5819
Parent 399d3a2d
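A minimal usage sketch of the interface change, based on the call sites updated below; `words` and `dict_size` are illustrative names, not part of this commit:

    import paddle.v2.fluid as fluid

    dict_size = 10000  # illustrative vocabulary size
    words = fluid.layers.data(name='words', shape=[1], dtype='int64')

    # Before: attributes were loosely typed dicts, e.g.
    #   param_attr={'name': 'shared_w', 'trainable': False}
    # After: pass a ParamAttr instance, or a plain string as shorthand
    # for ParamAttr(name=...).
    emb = fluid.layers.embedding(
        input=words,
        size=[dict_size, 32],
        param_attr=fluid.ParamAttr(name='shared_w', trainable=False))

    # The same parameter can be referenced elsewhere by name alone.
    emb_shared = fluid.layers.embedding(
        input=words,
        size=[dict_size, 32],
        param_attr='shared_w')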
......@@ -13,13 +13,14 @@ import nets
import optimizer
import backward
import regularizer
from param_attr import ParamAttr
from core import LoDTensor, CPUPlace, GPUPlace
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [
'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor'
'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
]
......
import copy
import itertools
from framework import Variable, default_main_program, default_startup_program, unique_name, dtype_is_floating
from framework import Variable, default_main_program, default_startup_program, \
unique_name, dtype_is_floating
from paddle.v2.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr
class LayerHelper(object):
......@@ -59,31 +61,15 @@ class LayerHelper(object):
@property
def param_attr(self):
default = {'name': None}
actual = self.kwargs.get('param_attr', None)
if actual is None:
actual = default
for default_field in default.keys():
if default_field not in actual:
actual[default_field] = default[default_field]
return actual
return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
@property
def bias_attr(self):
default = {'name': None}
bias_attr = self.kwargs.get('bias_attr', None)
if bias_attr is None:
bias_attr = default
if isinstance(bias_attr, dict):
for default_field in default.keys():
if default_field not in bias_attr:
bias_attr[default_field] = default[default_field]
return bias_attr
return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
def multiple_param_attr(self, length):
param_attr = self.param_attr
if isinstance(param_attr, dict):
if isinstance(param_attr, ParamAttr):
param_attr = [param_attr]
if len(param_attr) != 1 and len(param_attr) != length:
......@@ -111,23 +97,30 @@ class LayerHelper(object):
raise ValueError("Data Type mismatch")
return dtype
def create_parameter(self, attr, shape, dtype, suffix='w',
initializer=None):
def create_parameter(self,
attr,
shape,
dtype,
is_bias=False,
default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program
attr_copy = copy.deepcopy(attr)
if initializer is not None:
attr_copy['initializer'] = initializer
assert isinstance(attr, ParamAttr)
suffix = 'b' if is_bias else 'w'
if default_initializer is None:
if is_bias:
attr.set_default_bias_initializer()
else:
attr.set_default_param_initializer()
else:
attr_copy['initializer'] = self._get_default_initializer(dtype)
if attr_copy['name'] is None:
attr_copy['name'] = unique_name(".".join([self.name, suffix]))
attr.set_default_initializer(default_initializer)
if attr.name is None:
attr.name = unique_name(".".join([self.name, suffix]))
self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr_copy)
dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
return self.main_program.global_block().create_parameter(
name=attr_copy['name'],
dtype=dtype,
shape=shape,
trainable=attr_copy.get('trainable', True))
dtype=dtype, shape=shape, **attr.to_kwargs())
def create_tmp_variable(self, dtype):
return self.main_program.current_block().create_var(
......@@ -152,11 +145,7 @@ class LayerHelper(object):
persistable=True,
initializer=initializer)
def append_bias_op(self,
input_var,
bias_initializer,
dim_start=1,
dim_end=None):
def append_bias_op(self, input_var, dim_start=1, dim_end=None):
"""
Append bias operator and return its output. If the user does not set
bias_attr, append_bias_op will return input_var
......@@ -176,11 +165,7 @@ class LayerHelper(object):
return input_var
b = self.create_parameter(
attr=bias_attr,
shape=size,
dtype=input_var.dtype,
suffix='b',
initializer=bias_initializer)
attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
tmp = self.create_tmp_variable(dtype=input_var.dtype)
self.append_op(
type='elementwise_add',
......
......@@ -5,6 +5,7 @@ from initializer import Constant, Normal, Xavier, Initializer
from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re
import cStringIO
from param_attr import ParamAttr
__all__ = [
'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
......@@ -17,9 +18,7 @@ def fc(input,
size,
num_flatten_dims=1,
param_attr=None,
param_initializer=None,
bias_attr=None,
bias_initializer=None,
act=None,
name=None,
main_program=None,
......@@ -54,23 +53,10 @@ def fc(input,
to the LayerHelper constructor.
"""
def _get_default_param_initializer():
return Xavier()
def _get_default_bias_initializer():
return Constant()
helper = LayerHelper('fc', **locals())
dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
mul_results = []
for input_var, param_attr in helper.iter_inputs_and_params():
input_shape = input_var.shape
......@@ -78,10 +64,7 @@ def fc(input,
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size]
w = helper.create_parameter(
attr=param_attr,
initializer=param_initializer,
shape=param_shape,
dtype=dtype)
attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype)
helper.append_op(
type="mul",
......@@ -102,7 +85,7 @@ def fc(input,
helper.append_op(
type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
# add bias
pre_activation = helper.append_bias_op(pre_bias, bias_initializer)
pre_activation = helper.append_bias_op(pre_bias)
# add activation
return helper.append_activation(pre_activation)
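With param_initializer and bias_initializer gone from the fc signature, the common cases look like the following sketch, adapted from the examples updated in this commit (layer and parameter names are illustrative):

    import paddle.v2.fluid as fluid

    image = fluid.layers.data(name='x', shape=[784], dtype='float32')

    # A regularizer is accepted directly and wrapped into
    # ParamAttr(regularizer=...), as in the updated MNIST example.
    hidden = fluid.layers.fc(
        input=image,
        size=128,
        act='relu',
        param_attr=fluid.regularizer.L2Decay(0.0005 * 128))

    # bias_attr=False disables the bias term entirely.
    proj = fluid.layers.fc(
        input=hidden,
        size=64,
        bias_attr=False,
        param_attr=fluid.ParamAttr(name='proj_w'))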
......@@ -110,7 +93,6 @@ def fc(input,
def embedding(input,
size,
is_sparse=False,
param_initializer=None,
param_attr=None,
dtype='float32',
main_program=None,
......@@ -119,6 +101,7 @@ def embedding(input,
Embedding Layer.
Args:
param_initializer:
input: The input to the function
size: The size of the layer
is_sparse: A flag that declares whether the input is sparse
......@@ -136,15 +119,9 @@ def embedding(input,
"""
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('embedding', **locals())
w = helper.create_parameter(
attr=helper.param_attr,
shape=size,
dtype=dtype,
initializer=param_initializer or _get_default_param_initializer())
attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype)
helper.append_op(
type='lookup_table',
......@@ -176,7 +153,7 @@ def dynamic_lstm(input,
if not use_peepholes:
bias_size[1] = 4 * size
bias = helper.create_parameter(
attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b')
attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
hidden = helper.create_tmp_variable(dtype)
cell = helper.create_tmp_variable(dtype)
......@@ -471,19 +448,14 @@ def sums(input, out=None, main_program=None, startup_program=None):
def linear_chain_crf(input,
label,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
def _get_default_param_initializer():
return Xavier()
helper = LayerHelper('linear_chain_crf', **locals())
size = input.shape[1]
transition = helper.create_parameter(
attr=helper.param_attr,
shape=[size + 2, size],
dtype=helper.input_dtype(),
initializer=param_initializer or _get_default_param_initializer())
dtype=helper.input_dtype())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
......@@ -646,9 +618,7 @@ def sequence_conv(input,
filter_stride=1,
padding=None,
bias_attr=None,
bias_initializer=None,
param_attr=None,
param_initializer=None,
act=None,
main_program=None,
startup_program=None):
......@@ -658,30 +628,15 @@ def sequence_conv(input,
in the input parameters to the function.
"""
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer():
return Xavier()
# FIXME(dzh) : want to unify the argument of python layer
# function. So we ignore some unnecessary attributes.
# such as, padding_trainable, context_start.
helper = LayerHelper('sequence_conv', **locals())
dtype = helper.input_dtype()
if param_initializer is None:
param_initializer = _get_default_param_initializer()
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
filter_shape = [filter_size * input.shape[1], num_filters]
filter = helper.create_parameter(
attr=helper.param_attr,
shape=filter_shape,
dtype=dtype,
initializer=param_initializer)
attr=helper.param_attr, shape=filter_shape, dtype=dtype)
pre_bias = helper.create_tmp_variable(dtype)
helper.append_op(
......@@ -696,7 +651,7 @@ def sequence_conv(input,
'contextStart': -int(filter_size / 2),
'contextLength': filter_size
})
pre_act = helper.append_bias_op(pre_bias, bias_initializer)
pre_act = helper.append_bias_op(pre_bias)
return helper.append_activation(pre_act)
......@@ -707,9 +662,7 @@ def conv2d(input,
padding=None,
groups=None,
param_attr=None,
param_initializer=None,
bias_attr=None,
bias_initializer=None,
act=None,
name=None,
main_program=None,
......@@ -722,13 +675,6 @@ def conv2d(input,
conv-2d output, if mentioned in the input parameters.
"""
def _get_default_bias_initializer():
return Constant()
def _get_default_param_initializer(filter_size, num_channels):
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
return Normal(0.0, std, 0)
helper = LayerHelper('conv2d', **locals())
dtype = helper.input_dtype()
......@@ -750,17 +696,16 @@ def conv2d(input,
input_shape = input.shape
filter_shape = [num_filters, num_filter_channels] + filter_size
if param_initializer is None:
param_initializer = _get_default_param_initializer(filter_size,
num_channels)
if bias_initializer is None:
bias_initializer = _get_default_bias_initializer()
def _get_default_param_initializer():
std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
return Normal(0.0, std, 0)
filter = helper.create_parameter(
attr=helper.param_attr,
shape=filter_shape,
dtype=dtype,
initializer=param_initializer)
default_initializer=_get_default_param_initializer())
pre_bias = helper.create_tmp_variable(dtype)
helper.append_op(
......@@ -774,8 +719,7 @@ def conv2d(input,
'paddings': padding,
'groups': groups})
pre_act = helper.append_bias_op(
pre_bias, bias_initializer, dim_start=1, dim_end=2)
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
return helper.append_activation(pre_act)
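An illustrative conv2d call under the new interface; the Normal(0.0, std, 0) default filter initializer above is now chosen inside the layer unless the ParamAttr carries one. The layer and parameter names below are assumptions, not taken from the commit:

    import paddle.v2.fluid as fluid

    img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')

    conv = fluid.layers.conv2d(
        input=img,
        num_filters=16,
        filter_size=[3, 3],
        act='relu',
        param_attr=fluid.ParamAttr(name='conv_w'),   # filter weights
        bias_attr=fluid.ParamAttr(name='conv_b'))    # bias, Constant(0.0) default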
......@@ -876,12 +820,10 @@ def batch_norm(input,
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
initializer=Constant(1.0))
default_initializer=Constant(1.0))
bias = helper.create_parameter(
attr=helper.param_attr,
shape=param_shape,
dtype=dtype,
initializer=Constant(0.0))
attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable(
dtype=input.dtype, shape=param_shape, persistable=True)
......@@ -1356,7 +1298,7 @@ def lod_rank_table(x, level=0, main_program=None):
def max_sequence_len(rank_table, main_program=None):
"""
This function creates an operator to calculate the length of
the max sequence through input rank_table (should be a lod_rank_table)
"""
helper = LayerHelper("max_seqence_len", **locals())
......@@ -1594,35 +1536,33 @@ def conv2d_transpose(input,
padding=None,
stride=None,
param_attr=None,
param_initializer=None,
main_program=None,
startup_program=None):
"""
The transpose of conv2d layer.
This layer is also known as deconvolution layer.
Args:
input(Variable): The input image with [N, C, H, W] format.
num_filters(int): The number of filter. It is as same as the output
image channel.
output_size(int|tuple|None): The output image size. If output size is a
tuple, it must contain two integers, (image_H, image_W). This
parameter only works when filter_size is None.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square. None if use output size to
calculate filter_size
padding(int|tuple): The padding size. If padding is a tuple, it must
contain two integers, (padding_H, padding_W). Otherwise, the
padding_H = padding_W = padding.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride.
param_attr: Parameter Attribute.
param_initializer(Initializer): Parameter Initializer. Default is Xavier
main_program(Program): the main program
startup_program(Program): the startup program
Returns:
Variable: Output image.
......@@ -1663,10 +1603,7 @@ def conv2d_transpose(input,
filter_shape = [input_channel, num_filters] + filter_size
img_filter = helper.create_parameter(
dtype=input.dtype,
shape=filter_shape,
attr=helper.param_attr,
initializer=param_initializer)
dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
out = helper.create_tmp_variable(dtype=input.dtype)
helper.append_op(
......@@ -1675,6 +1612,7 @@ def conv2d_transpose(input,
'Filter': [img_filter]},
outputs={'Output': out},
attrs=op_attr)
return out
......
from initializer import Initializer, Xavier, Constant
from regularizer import WeightDecayRegularizer
class ParamAttr(object):
def __init__(self,
name=None,
initializer=None,
learning_rate=1.0,
regularizer=None,
trainable=True):
self.name = name
self.initializer = initializer
self.learning_rate = learning_rate
self.regularizer = regularizer
self.trainable = trainable
def set_default_initializer(self, initializer):
if initializer is None:
if self.initializer is None:
raise ValueError("ParamAttr.initializer is not set")
return
if self.initializer is not None:
return
self.initializer = initializer
def set_default_param_initializer(self):
self.set_default_initializer(Xavier())
def set_default_bias_initializer(self):
self.set_default_initializer(Constant(0.0))
@staticmethod
def to_attr(arg):
if arg is None:
return ParamAttr()
elif isinstance(arg, ParamAttr):
return arg
elif isinstance(arg, str) or isinstance(arg, unicode):
return ParamAttr(name=arg)
elif isinstance(arg, Initializer):
return ParamAttr(initializer=arg)
elif isinstance(arg, WeightDecayRegularizer):
return ParamAttr(regularizer=arg)
elif isinstance(arg, bool):
return ParamAttr.to_attr(None) if arg else False
else:
raise TypeError("{0} cast to ParamAttr".format(type(arg)))
def to_kwargs(self, with_initializer=False):
kwargs = {
'name': self.name,
'learning_rate': self.learning_rate,
'regularizer': self.regularizer,
'trainable': self.trainable
}
if with_initializer:
kwargs['initializer'] = self.initializer
return kwargs
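A minimal sketch of how ParamAttr.to_attr normalizes the accepted argument forms and how the default initializers are filled in; the parameter name is illustrative:

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.initializer import Xavier, Constant

    # None -> ParamAttr with all defaults; a string -> ParamAttr(name=...).
    attr = fluid.ParamAttr.to_attr(None)
    named = fluid.ParamAttr.to_attr('fc_0.w_0')
    assert attr.name is None and named.name == 'fc_0.w_0'

    # set_default_* only fills the initializer when none was given:
    # Xavier() for weights, Constant(0.0) for biases.
    named.set_default_param_initializer()
    attr.set_default_bias_initializer()
    assert isinstance(named.initializer, Xavier)
    assert isinstance(attr.initializer, Constant)

    # to_kwargs() builds the keyword arguments for create_parameter; the
    # initializer is only forwarded to the startup program.
    print(attr.to_kwargs(with_initializer=True))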
......@@ -44,7 +44,7 @@ def db_lstm():
size=[pred_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'vemb'})
param_attr='vemb')
mark_embedding = fluid.layers.embedding(
input=mark,
......@@ -57,8 +57,8 @@ def db_lstm():
fluid.layers.embedding(
size=[word_dict_len, word_dim],
input=x,
param_attr={'name': embedding_name,
'trainable': False}) for x in word_input
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False)) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
......@@ -125,8 +125,8 @@ def main():
crf_cost = fluid.layers.linear_chain_crf(
input=feature_out,
label=target,
param_attr={"name": 'crfw',
"learning_rate": mix_hidden_lr})
param_attr=fluid.ParamAttr(
name='crfw', learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(x=crf_cost)
# TODO(qiao)
# 1. add crf_decode_layer and evaluator
......
......@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid
BATCH_SIZE = 128
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
param_attr = {
'name': None,
'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
}
regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
hidden1 = fluid.layers.fc(input=image,
size=128,
act='relu',
param_attr=param_attr)
param_attr=regularizer)
hidden2 = fluid.layers.fc(input=hidden1,
size=64,
act='relu',
param_attr=param_attr)
param_attr=regularizer)
predict = fluid.layers.fc(input=hidden2,
size=10,
act='softmax',
param_attr=param_attr)
param_attr=regularizer)
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
......
......@@ -24,7 +24,7 @@ def get_usr_combined_features():
input=uid,
dtype='float32',
size=[USR_DICT_SIZE, 32],
param_attr={'name': 'user_table'},
param_attr='user_table',
is_sparse=IS_SPARSE)
usr_fc = layers.fc(input=usr_emb, size=32)
......@@ -36,7 +36,7 @@ def get_usr_combined_features():
usr_gender_emb = layers.embedding(
input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16],
param_attr={'name': 'gender_table'},
param_attr='gender_table',
is_sparse=IS_SPARSE)
usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
......@@ -48,7 +48,7 @@ def get_usr_combined_features():
input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16],
is_sparse=IS_SPARSE,
param_attr={'name': 'age_table'})
param_attr='age_table')
usr_age_fc = layers.fc(input=usr_age_emb, size=16)
......@@ -58,7 +58,7 @@ def get_usr_combined_features():
usr_job_emb = layers.embedding(
input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16],
param_attr={'name': 'job_table'},
param_attr='job_table',
is_sparse=IS_SPARSE)
usr_job_fc = layers.fc(input=usr_job_emb, size=16)
......@@ -81,7 +81,7 @@ def get_mov_combined_features():
input=mov_id,
dtype='float32',
size=[MOV_DICT_SIZE, 32],
param_attr={'name': 'movie_table'},
param_attr='movie_table',
is_sparse=IS_SPARSE)
mov_fc = layers.fc(input=mov_emb, size=32)
......
......@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding(
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'})
param_attr='shared_w')
embed_second = fluid.layers.embedding(
input=second_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'})
param_attr='shared_w')
embed_third = fluid.layers.embedding(
input=third_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'})
param_attr='shared_w')
embed_forth = fluid.layers.embedding(
input=forth_word,
size=[dict_size, EMBED_SIZE],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr={'name': 'shared_w'})
param_attr='shared_w')
concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
......
......@@ -132,26 +132,26 @@ class TestBook(unittest.TestCase):
input=first_word,
size=[dict_size, embed_size],
dtype='float32',
param_attr={'name': 'shared_w'},
param_attr='shared_w',
main_program=program)
embed_second = layers.embedding(
input=second_word,
size=[dict_size, embed_size],
dtype='float32',
param_attr={'name': 'shared_w'},
param_attr='shared_w',
main_program=program)
embed_third = layers.embedding(
input=third_word,
size=[dict_size, embed_size],
dtype='float32',
param_attr={'name': 'shared_w'},
param_attr='shared_w',
main_program=program)
embed_forth = layers.embedding(
input=forth_word,
size=[dict_size, embed_size],
dtype='float32',
param_attr={'name': 'shared_w'},
param_attr='shared_w',
main_program=program)
concat_embed = layers.concat(
......
......@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
temp_l = layers.fc(input=x_t,
size=self.input_dim,
param_attr={'name': 'W'},
param_attr='W',
bias_attr=False,
**self.p_info)
temp_r = layers.fc(input=h_pre,
size=self.input_dim,
param_attr={'name': 'U'},
param_attr='U',
bias_attr=False,
**self.p_info)
......