From 2c5a6ac09575d66669e207fbc366fd981902cca7 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Tue, 20 Sep 2016 15:03:57 +0800
Subject: [PATCH] Optional fields to shrink generated proto size (#93)

* remove unnecessary field set in ParameterConfig, Evaluators, etc
---
 paddle/gserver/layers/CRFLayer.cpp             |   2 +-
 paddle/gserver/layers/CostLayer.cpp            |   6 +-
 paddle/trainer/tests/.gitignore                |   1 +
 proto/ModelConfig.proto.m4                     |   2 +-
 proto/ParameterConfig.proto.m4                 |   8 +-
 python/paddle/trainer/config_parser.py         | 106 ++++++++++++------
 .../trainer_config_helpers/evaluators.py       |  37 +++---
 .../trainer_config_helpers/optimizers.py       |   2 +-
 8 files changed, 99 insertions(+), 65 deletions(-)

diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index fb0a0ddb3d..c1dcad2b5f 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -31,7 +31,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   }
 
   // coeff only affect bp, keep consistent with CostLayer
-  coeff_ = config_.has_coeff() ? config_.coeff() : real(1.0);
+  coeff_ = config_.coeff();
   if (inputLayers_.size() == 3) {
     weightLayer_ = inputLayers_[2];
   }
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 0f99aee032..14ff8510f7 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -26,11 +26,7 @@ namespace paddle {
 bool CostLayer::init(const LayerMap& layerMap,
                      const ParameterMap& parameterMap) {
   bool ret = Layer::init(layerMap, parameterMap);
-  if (config_.has_coeff()) {
-    coeff_ = config_.coeff();  // coeff only affact bp
-  } else {
-    coeff_ = real(1.0);
-  }
+  coeff_ = config_.coeff();
   if (!ret) return ret;
   CHECK_GE(inputLayers_.size(), 2UL);
   CHECK_LE(inputLayers_.size(), 3UL);
diff --git a/paddle/trainer/tests/.gitignore b/paddle/trainer/tests/.gitignore
index 79f7012036..aedb0ef22e 100644
--- a/paddle/trainer/tests/.gitignore
+++ b/paddle/trainer/tests/.gitignore
@@ -1,2 +1,3 @@
 dump_text.test
 test_pydata_provider_wrapper.json
+*proto.bin
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index a2b243a786..b32f8b1ee9 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -299,7 +299,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional bool norm_by_times = 25;
 
   // for CostLayers
-  optional real coeff = 26;
+  optional real coeff = 26 [default = 1.0];
 
   // for AverageLayer
   // can be set to: 'average', 'sum' or 'squarerootn'
diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto.m4
index 222e070089..e8d512445e 100644
--- a/proto/ParameterConfig.proto.m4
+++ b/proto/ParameterConfig.proto.m4
@@ -31,8 +31,8 @@ message ParameterUpdaterHookConfig {
 message ParameterConfig {
   required string name = 1;
   required uint64 size = 2;
-  required real learning_rate = 3;
-  required real momentum = 4;
+  optional real learning_rate = 3 [default = 1.0];
+  optional real momentum = 4 [default = 0.0];
   optional real initial_mean = 5 [default = 0.0];
   optional real initial_std = 6 [default = 0.01];
   // use L2-regularization if decay_rate set and decay_rate_l1 not set
@@ -54,8 +54,8 @@ message ParameterConfig {
   optional int32 num_batches_regularization = 13 [default = 1];
   // if is_sparse is true, para is sparse, else para is dense
   optional bool is_sparse = 14[default = false];
-  // if para is sparse, format should be "csc" or "csr"
-  optional string format = 15[default = "csr"];
+  // if para is sparse, format should be "csc" or "csr", empty means is not sparse
+  optional string format = 15 [default = ""];
   // sparse remote update or not
   optional bool sparse_remote_update = 16 [default = false];
   // gradient clipping threshold, no clipping by default
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index c5709208d4..4ce01e005a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -114,15 +114,15 @@ g_layer_type_map = {}
 # Initialize global variables. We use this function so that we can
 # call parse_config() multiple times
 def init_config_environment(
-        g_default_momentum = 0.,
-        g_default_decay_rate = 0.,
+        g_default_momentum = None,
+        g_default_decay_rate = None,
         g_default_initial_mean = 0.,
         g_default_initial_std = 0.01,
-        g_default_num_batches_regularization = 1,
+        g_default_num_batches_regularization = None,
         g_default_initial_strategy = 0,
         g_default_initial_smart = False,
-        g_default_gradient_clipping_threshold = 0.,
-        g_default_device = -1,
+        g_default_gradient_clipping_threshold = None,
+        g_default_device = None,
         g_default_update_hooks = None,
         g_default_compact_func = None,
@@ -1099,12 +1099,12 @@ def Evaluator(
         inputs,
         chunk_scheme = None,
         num_chunk_types = None,
-        classification_threshold = 0.5,
-        positive_label = -1,
-        dict_file = "",
-        result_file = "",
-        num_results = 1,
-        delimited = True,
+        classification_threshold = None,
+        positive_label = None,
+        dict_file = None,
+        result_file = None,
+        num_results = None,
+        delimited = None,
         ):
     evaluator = g_config.model_config.evaluators.add()
     evaluator.type = type
@@ -1120,12 +1120,19 @@ def Evaluator(
         evaluator.num_chunk_types = num_chunk_types
     g_current_submodel.evaluator_names.append(evaluator.name)
 
-    evaluator.classification_threshold = classification_threshold
-    evaluator.positive_label = positive_label
-    evaluator.dict_file = dict_file
-    evaluator.result_file = result_file
-    evaluator.num_results = num_results
-    evaluator.delimited = delimited
+    if classification_threshold is not None:
+        evaluator.classification_threshold = classification_threshold
+    if positive_label is not None:
+        evaluator.positive_label = positive_label
+    if dict_file is not None:
+        evaluator.dict_file = dict_file
+
+    if result_file is not None:
+        evaluator.result_file = result_file
+    if num_results is not None:
+        evaluator.num_results = num_results
+    if delimited is not None:
+        evaluator.delimited = delimited
 
 class LayerBase(object):
     def __init__(
@@ -1137,7 +1144,7 @@ class LayerBase(object):
             device=None,
             active_type="",
             drop_rate=0.,
-            coeff=1.):
+            coeff=None):
         config_assert('@' not in name,
                       "layer name: %s contain special character @" % name)
         global g_current_submodel
@@ -1155,10 +1162,12 @@ class LayerBase(object):
             self.inputs = [self.inputs]
 
         self.config = g_config.model_config.layers.add()
+        assert isinstance(self.config, LayerConfig)
         self.config.name = name
         self.config.type = type
         self.config.active_type = active_type
-        self.config.coeff = coeff
+        if coeff is not None:
+            self.config.coeff = float(coeff)
         if size != 0:
             self.config.size = size
         if drop_rate != 0:
@@ -1166,7 +1175,7 @@ class LayerBase(object):
 
         if device is not None:
             self.config.device = device
-        else:
+        elif g_default_device is not None:
             self.config.device = g_default_device
 
         for input_index in xrange(len(self.inputs)):
@@ -1236,10 +1245,12 @@ class LayerBase(object):
         if bias.parameter_name is None:
             bias.parameter_name = gen_bias_parameter_name(self.config.name)
         if bias.parameter_name not in g_parameter_map:
+            assert isinstance(self.config, LayerConfig)
+
             Parameter(
                 bias.parameter_name,
                 size,
-                self.config.device,
+                self.config.device if self.config.HasField('device') else None,
                 dims,
                 bias.learning_rate,
                 bias.momentum,
@@ -1265,7 +1276,7 @@ class LayerBase(object):
             input_index,
             size,
             dims=None,
-            sparse = False,
+            sparse = None,
             format = "csr"):
         if dims is None:
             # TODO(yuyang18): print warning and callstack here!
@@ -1293,7 +1304,7 @@ class LayerBase(object):
             Parameter(
                 input_config.parameter_name,
                 size,
-                self.config.device,
+                self.config.device if self.config.HasField("device") else None,
                 dims,
                 input_config.learning_rate,
                 input_config.momentum,
@@ -1353,6 +1364,8 @@ class FCLayer(LayerBase):
 
             if sparse:
                 psize = self.inputs[input_index].nnz
+            else:
+                sparse = None
 
             self.create_input_parameter(input_index, psize, dims, sparse, format)
         self.create_bias_parameter(bias, self.config.size)
@@ -2836,27 +2849,44 @@ def Parameter(
     para = g_config.model_config.parameters.add()
     para.name = name
     para.size = size
-    para.device = device
-    para.dims.extend(dims);
-    para.learning_rate = default(learning_rate, 1.)
-    para.momentum = default(momentum, g_default_momentum)
+    if device is not None:
+        para.device = int(device)
+    para.dims.extend(dims)
+
+    if learning_rate is not None:
+        para.learning_rate = float(learning_rate)
+
+    momentum = default(momentum, g_default_momentum)
+    if momentum is not None:
+        para.momentum = float(momentum)
+
     config_assert(not momentum or not decay_rate_l1,
                   "momentum and decay_rate_l1 cannot both be non-zero")
-    para.decay_rate = default(decay_rate, g_default_decay_rate)
+
+    decay_rate = default(decay_rate, g_default_decay_rate)
+    if decay_rate is not None:
+        para.decay_rate = decay_rate
+
     if decay_rate_l1 is not None:
         para.decay_rate_l1 = decay_rate_l1
     para.initial_std = default(initial_std, g_default_initial_std)
     para.initial_mean = default(initial_mean, g_default_initial_mean)
-    para.num_batches_regularization = default(
+
+    num_batches_regularization = default(
         num_batches_regularization, g_default_num_batches_regularization)
+    if num_batches_regularization is not None:
+        para.num_batches_regularization = int(num_batches_regularization)
+
     if sparse_remote_update is not None:
         para.sparse_remote_update = sparse_remote_update
         if sparse_remote_update:
             g_config.opt_config.use_sparse_remote_updater = True
     if sparse_update is not None:
         para.sparse_update = sparse_update
-    para.gradient_clipping_threshold = default(
-        gradient_clipping_threshold, g_default_gradient_clipping_threshold);
+    gradient_clipping_threshold = default(
+        gradient_clipping_threshold, g_default_gradient_clipping_threshold)
+    if gradient_clipping_threshold is not None:
+        para.gradient_clipping_threshold = gradient_clipping_threshold
     para.initial_strategy = default(initial_strategy, g_default_initial_strategy)
     para.initial_smart = default(initial_smart, g_default_initial_smart)
     if para.initial_smart:
@@ -2869,15 +2899,19 @@ def Parameter(
         para.initial_std = 1. / math.sqrt(para.size)
     if g_default_compact_func is not None:
         sparse, format, need_compact = g_default_compact_func(para.name)
-        para.is_sparse = default(sparse, False)
-        para.format = default(format, "")
-        para.need_compact = default(need_compact, False)
+
+    if sparse is not None:
+        para.is_sparse = sparse
+    if format is not None:
+        para.format = format
+    if need_compact is not None:
+        para.need_compact = need_compact
     if is_static is not None:
         para.is_static = is_static
     config_assert(not para.sparse_remote_update or not para.is_static,
                   "sparse_remote_update and is_static cannot both be true")
-
-    para.is_shared = default(is_shared, False)
+    if is_shared is not None:
+        para.is_shared = is_shared
 
     update_hooks = default(update_hooks, g_default_update_hooks)
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 985fae9f95..7a00d0b7ec 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -65,12 +65,12 @@ def evaluator_base(
         name=None,
         chunk_scheme=None,
         num_chunk_types=None,
-        classification_threshold=0.5,
-        positive_label=-1,
-        dict_file="",
-        result_file="",
-        num_results=1,
-        delimited=True):
+        classification_threshold=None,
+        positive_label=None,
+        dict_file=None,
+        result_file=None,
+        num_results=None,
+        delimited=None):
     """
     Evaluator will evaluate the network status while training/testing.
 
@@ -105,9 +105,10 @@ def evaluator_base(
     :type weight: LayerOutput.
     """
    # inputs type assertions.
-    assert isinstance(classification_threshold, float)
-    assert isinstance(positive_label, int)
-    assert isinstance(num_results, int)
+    assert classification_threshold is None or isinstance(
+        classification_threshold, float)
+    assert positive_label is None or isinstance(positive_label, int)
+    assert num_results is None or isinstance(num_results, int)
 
     if not isinstance(input, list):
         input = [input]
@@ -136,7 +137,7 @@ def classification_error_evaluator(
         label,
         name=None,
         weight=None,
-        threshold=0.5):
+        threshold=None):
     """
     Classification Error Evaluator. It will print error rate for classification.
 
@@ -253,7 +254,7 @@ def pnpair_evaluator(
 def precision_recall_evaluator(
         input,
         label,
-        positive_label=-1,
+        positive_label=None,
         weight=None,
         name=None,
         ):
@@ -494,7 +495,7 @@ def gradient_printer_evaluator(
 @wrap_name_default()
 def maxid_printer_evaluator(
         input,
-        num_results=1,
+        num_results=None,
         name=None,
         ):
     """
@@ -518,13 +519,14 @@ def maxid_printer_evaluator(
     """
     evaluator_base(name=name,
                    type="max_id_printer",
-                   input=input)
+                   input=input,
+                   num_results=num_results)
 
 @evaluator(EvaluatorAttribute.FOR_PRINT)
 @wrap_name_default()
 def maxframe_printer_evaluator(
         input,
-        num_results=1,
+        num_results=None,
         name=None,
         ):
     """
@@ -556,9 +558,9 @@ def maxframe_printer_evaluator(
 @wrap_name_default()
 def seqtext_printer_evaluator(
         input,
-        dict_file="",
-        result_file="",
-        delimited=True,
+        result_file,
+        dict_file=None,
+        delimited=None,
         name=None,
         ):
     """
@@ -616,6 +618,7 @@ def seqtext_printer_evaluator(
     :param name: Evaluator name.
     :type name: None|basestring
     """
+    assert isinstance(result_file, basestring)
     evaluator_base(name=name,
                    type="seq_text_printer",
                    input=input,
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index ed676ac215..af85f745f6 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -79,7 +79,7 @@ class MomentumOptimizer(BaseSGDOptimizer):
         'learning_method': 'momentum'
     }
 
-    def __init__(self, momentum=1e-3):
+    def __init__(self, momentum=None):
         self.momentum = momentum
-- 
GitLab
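
Why the new defaults shrink the generated config: in proto2, an optional field with a [default = ...] value is simply not encoded when it is never assigned, whereas the previously required or unconditionally assigned fields were written into every serialized ParameterConfig, LayerConfig, and Evaluator message. The short plain-Python sketch below mirrors the "only set a field when the caller actually supplies a value" idiom this patch adopts in config_parser.py; the Config class and field names are illustrative stand-ins, not Paddle or protobuf APIs.

# Minimal sketch of the pattern used above (illustrative names, not Paddle code).
class Config(object):
    """Stands in for a generated proto2 message whose fields are optional with defaults."""

    _defaults = {'learning_rate': 1.0, 'momentum': 0.0, 'coeff': 1.0}

    def __init__(self):
        self._fields = {}  # only explicitly set fields are stored, i.e. serialized

    def set_if_given(self, name, value):
        # Mirrors the `if value is not None: config.field = value` checks in the patch.
        if value is not None:
            self._fields[name] = value

    def get(self, name):
        # Readers still observe the default, like proto2 accessors do for unset fields.
        return self._fields.get(name, self._defaults[name])


cfg = Config()
cfg.set_if_given('momentum', None)       # caller passed nothing: field stays unset
cfg.set_if_given('learning_rate', 0.1)   # caller overrode the default: field is set

assert cfg.get('momentum') == 0.0                 # default is still visible to readers
assert cfg._fields == {'learning_rate': 0.1}      # only one field would be encoded

In the patched code the same effect comes from the real generated messages: values left as None in the helper functions never reach the proto, so the default declared in ParameterConfig.proto.m4 or ModelConfig.proto.m4 applies and the unset field takes no space in the serialized config.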