From d6944dec16933a48396e4dc03e13a365cba60fb8 Mon Sep 17 00:00:00 2001 From: emailweixu Date: Tue, 25 Oct 2016 18:40:05 -0700 Subject: [PATCH] Sequence tagging demo (#225) --- demo/sequence_tagging/data/get_data.sh | 21 ++ demo/sequence_tagging/data/test.list | 1 + demo/sequence_tagging/data/train.list | 1 + demo/sequence_tagging/dataprovider.py | 258 ++++++++++++++++++ demo/sequence_tagging/linear_crf.py | 84 ++++++ demo/sequence_tagging/readme.md | 45 +++ demo/sequence_tagging/rnn_crf.py | 130 +++++++++ demo/sequence_tagging/train.sh | 10 + demo/sequence_tagging/train_linear.sh | 9 + .../trainer_config_helpers/optimizers.py | 15 +- 10 files changed, 572 insertions(+), 2 deletions(-) create mode 100755 demo/sequence_tagging/data/get_data.sh create mode 100644 demo/sequence_tagging/data/test.list create mode 100644 demo/sequence_tagging/data/train.list create mode 100644 demo/sequence_tagging/dataprovider.py create mode 100644 demo/sequence_tagging/linear_crf.py create mode 100644 demo/sequence_tagging/readme.md create mode 100644 demo/sequence_tagging/rnn_crf.py create mode 100755 demo/sequence_tagging/train.sh create mode 100755 demo/sequence_tagging/train_linear.sh diff --git a/demo/sequence_tagging/data/get_data.sh b/demo/sequence_tagging/data/get_data.sh new file mode 100755 index 00000000000..e579d6c46ce --- /dev/null +++ b/demo/sequence_tagging/data/get_data.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +DIR="$( cd "$(dirname "$0")" ; pwd -P )" +cd $DIR + +wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz +wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz diff --git a/demo/sequence_tagging/data/test.list b/demo/sequence_tagging/data/test.list new file mode 100644 index 00000000000..073c0a0c906 --- /dev/null +++ b/demo/sequence_tagging/data/test.list @@ -0,0 +1 @@ +data/test.txt.gz diff --git a/demo/sequence_tagging/data/train.list b/demo/sequence_tagging/data/train.list new file mode 100644 index 00000000000..43c24d5f648 --- /dev/null +++ b/demo/sequence_tagging/data/train.list @@ -0,0 +1 @@ +data/train.txt.gz diff --git a/demo/sequence_tagging/dataprovider.py b/demo/sequence_tagging/dataprovider.py new file mode 100644 index 00000000000..6f412d6834b --- /dev/null +++ b/demo/sequence_tagging/dataprovider.py @@ -0,0 +1,258 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import gzip +import logging + +logging.basicConfig( + format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', +) +logger = logging.getLogger('paddle') +logger.setLevel(logging.INFO) + +OOV_POLICY_IGNORE = 0 +OOV_POLICY_USE = 1 +OOV_POLICY_ERROR = 2 + +num_original_columns = 3 + +# Feature combination patterns. 
# Feature combination patterns.
# Each pattern is a list of [offset, column] pairs; the selected values are
# joined with '/' to form one combined feature.
# [[-1,0], [0,0]] means previous token at column 0 and current token at
# column 0 are combined as one feature.
patterns = [
    [[-2, 0]],
    [[-1, 0]],
    [[0, 0]],
    [[1, 0]],
    [[2, 0]],

    [[-1, 0], [0, 0]],
    [[0, 0], [1, 0]],

    [[-2, 1]],
    [[-1, 1]],
    [[0, 1]],
    [[1, 1]],
    [[2, 1]],
    [[-2, 1], [-1, 1]],
    [[-1, 1], [0, 1]],
    [[0, 1], [1, 1]],
    [[1, 1], [2, 1]],

    [[-2, 1], [-1, 1], [0, 1]],
    [[-1, 1], [0, 1], [1, 1]],
    [[0, 1], [1, 1], [2, 1]],
]

# Fixed mapping from chunk label to integer id.
dict_label = {
    'B-ADJP': 0,
    'I-ADJP': 1,
    'B-ADVP': 2,
    'I-ADVP': 3,
    'B-CONJP': 4,
    'I-CONJP': 5,
    'B-INTJ': 6,
    'I-INTJ': 7,
    'B-LST': 8,
    'I-LST': 9,
    'B-NP': 10,
    'I-NP': 11,
    'B-PP': 12,
    'I-PP': 13,
    'B-PRT': 14,
    'I-PRT': 15,
    'B-SBAR': 16,
    'I-SBAR': 17,
    'B-UCP': 18,
    'I-UCP': 19,
    'B-VP': 20,
    'I-VP': 21,
    'O': 22
}


def make_features(sequence):
    """Append one combined feature per entry of `patterns` to every token.

    `sequence` is a list of tokens, each token being a list of column
    strings. The token lists are extended in place: after the call each
    token has len(patterns) additional string columns. Positions before
    the start of the sequence are represented as '#B<k>' and positions past
    the end as '#E<k>', where k is the distance outside the sequence.

    An empty sequence is left untouched (the previous implementation raised
    IndexError on `sequence[0]`).

    Returns None.
    """
    if not sequence:
        return

    length = len(sequence)
    # Number of columns of the first token, captured before any token is
    # extended; used as the width of the boundary placeholder rows.
    num_features = len(sequence[0])

    def get_features(pos):
        # Row of column values at `pos`, or a placeholder row when `pos`
        # falls outside the sequence.
        if pos < 0:
            return ['#B%s' % -pos] * num_features
        if pos >= length:
            return ['#E%s' % (pos - length + 1)] * num_features
        return sequence[pos]

    for i, token in enumerate(sequence):
        for pattern in patterns:
            fname = '/'.join(
                [get_features(i + pos)[f] for pos, f in pattern])
            token.append(fname)
def _read_sequences(filename):
    """Yield the token sequences stored in a gzip-compressed data file.

    Source file format: each line is for one timestep and its features are
    separated by a single space. An empty line indicates the end of a
    sequence. Every yielded sequence is a non-empty list of feature lists.

    Runs of empty lines do not produce empty sequences, and a final sequence
    that is not followed by an empty line is still yielded. The file is
    closed even when the consumer stops early or an error is raised.
    """
    f = gzip.open(filename, 'rb')
    try:
        sequence = []
        for line in f:
            line = line.strip()
            if line:
                sequence.append(line.split(' '))
            elif sequence:
                yield sequence
                sequence = []
        if sequence:
            # The file did not end with an empty line.
            yield sequence
    finally:
        f.close()


def create_dictionaries(filename, cutoff, oov_policy):
    """Build one feature -> id dictionary per column of the data file.

    filename: gzip-compressed source file (see _read_sequences for format).
    cutoff: a list of numbers, one per column (original columns followed by
        the pattern columns added by make_features). If the count of a
        feature is smaller than cutoff[i], it is dropped from column i.
    oov_policy: a list of OOV_POLICY_* values, one per column. If
        oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features
        of the i-th column and stored under the key '#OOV#'.

    Returns a list of dicts, one for each column.
    Raises AssertionError if a timestep does not have len(cutoff) columns
    after make_features has been applied.
    """
    num_features = len(cutoff)
    dicts = [dict() for _ in range(num_features)]

    # First pass: count the occurrences of every feature in every column.
    for sequence in _read_sequences(filename):
        make_features(sequence)
        for features in sequence:
            assert len(features) == num_features, \
                "Wrong number of features " + ' '.join(features)
            for dct, feature in zip(dicts, features):
                dct[feature] = dct.get(feature, 0) + 1

    # Second pass: replace the counts by consecutive ids and drop the
    # features whose count is below the cutoff.
    for i, dct in enumerate(dicts):
        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
        todo = []
        # Iterate over a snapshot because the values are rewritten in place.
        for k, v in list(dct.items()):
            if v < cutoff[i]:
                todo.append(k)
            else:
                dct[k] = n
                n += 1

        if oov_policy[i] == OOV_POLICY_USE:
            # placeholder so that len(dct) will be the number of features
            # including OOV
            dct['#OOV#'] = 0

        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
        for k in todo:
            del dct[k]

    return dicts


def initializer(settings, **xargs):
    """Init hook of the data provider.

    Builds the dictionaries from 'data/train.txt.gz', replaces the
    dictionary of column 2 by the fixed `dict_label` mapping and stores on
    `settings`:
      - settings.dicts: list of per-column dictionaries,
      - settings.oov_policy: list of per-column OOV policies,
      - settings.input_types: one integer_sequence per original column,
        plus one sparse_binary_vector_sequence covering all pattern columns
        when `patterns` is not empty.
    """
    cutoff = [3, 1, 0]
    cutoff += [3] * len(patterns)
    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
    dicts[2] = dict_label
    settings.dicts = dicts
    settings.oov_policy = oov_policy

    input_types = []
    for i in range(num_original_columns):
        input_types.append(integer_sequence(len(dicts[i])))
        logger.info("slot %s size=%s" % (i, len(dicts[i])))
    if patterns:
        # All pattern columns share one sparse vector; its dimension is the
        # sum of the sizes of their dictionaries.
        dim = sum(len(dct) for dct in dicts[num_original_columns:])
        input_types.append(sparse_binary_vector_sequence(dim))
        logger.info("feature size=%s" % dim)
    settings.input_types = input_types


@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename):
    """Yield one sample per sequence found in `filename`.

    A sample is a list with one id list per original column and, when
    `patterns` is not empty, a final list holding one sparse index vector
    per timestep for the pattern features.

    Handling of features that are missing from settings.dicts[i]:
      - OOV_POLICY_USE: the feature is assigned to id 0 of its column.
      - OOV_POLICY_IGNORE: an original column gets 0xffffffff, a pattern
        column contributes nothing to the sparse vector.
      - OOV_POLICY_ERROR: all features in the i-th column MUST exist in
        dicts[i]; a violation is logged with logger.fatal.
    """
    dicts = settings.dicts
    oov_policy = settings.oov_policy
    num_features = len(dicts)

    def gen_sample(sequence):
        sample = [list() for _ in range(num_original_columns)]
        if patterns:
            sample.append([])
        for features in sequence:
            assert len(features) == num_features, \
                "Wrong number of features: " + ' '.join(features)
            for i in range(num_original_columns):
                idx = dicts[i].get(features[i], -1)
                if idx != -1:
                    sample[i].append(idx)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    sample[i].append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    # NOTE(review): only logs; nothing is appended for this
                    # timestep, so this column ends up shorter than the
                    # others. Confirm whether this should abort instead.
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    sample[i].append(0)

            if patterns:
                dim = 0  # offset of column i inside the shared sparse vector
                vec = []
                for i in range(num_original_columns, num_features):
                    idx = dicts[i].get(features[i], -1)
                    if idx != -1:
                        vec.append(dim + idx)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        # OOV id 0 of this column. `vec` is a plain list;
                        # the former `vec.ids.append` raised AttributeError.
                        vec.append(dim + 0)

                    dim += len(dicts[i])
                sample[-1].append(vec)
        return sample

    num_sequences = 0
    for sequence in _read_sequences(filename):
        make_features(sequence)
        yield gen_sample(sequence)
        num_sequences += 1

    logger.info("num_sequences=%s" % num_sequences)
b/demo/sequence_tagging/linear_crf.py @@ -0,0 +1,84 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + + +batch_size = 1 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-4), + average_window=0.5, + learning_rate=1e-1, + learning_rate_decay_a=1e-5, + learning_rate_decay_b=0.25, +) + +num_label_types=23 + +def get_simd_size(size): + return int(math.ceil(float(size) / 8)) * 8 + +# Currently, in order to use sparse_update=True, +# the size has to be aligned. 
+num_label_types = get_simd_size(num_label_types) + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types) + +crf_input = fc_layer( + input=features, + size=num_label_types, + act=LinearActivation(), + bias_attr=False, + param_attr=ParamAttr(initial_std=0, sparse_update=True)) + +crf=crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), +) + +crf_decoding=crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/readme.md b/demo/sequence_tagging/readme.md new file mode 100644 index 00000000000..2e17fffb83c --- /dev/null +++ b/demo/sequence_tagging/readme.md @@ -0,0 +1,45 @@ +# Sequence Tagging + +This demo is a sequence model for assigning tags to each token in a sentence. The task is described in the CoNLL-2000 Text Chunking shared task. + +## Download data +```bash +cd demo/sequence_tagging +./data/get_data.sh +``` + +## Train model +```bash +cd demo/sequence_tagging +./train.sh +``` + +## Model description + +We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at leon.bottou.org/projects/sgd. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py). +
+ + + + + + + + + + + + + + + + + + + + + + +
Model nameNumber of parametersF1 score
linear_crf 1.8M 0.937
rnn_crf 960K 0.941
+
+
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py new file mode 100644 index 00000000000..fb157bf3ea7 --- /dev/null +++ b/demo/sequence_tagging/rnn_crf.py @@ -0,0 +1,130 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +import math + +define_py_data_sources2(train_list="data/train.list", + test_list="data/test.list", + module="dataprovider", + obj="process") + +batch_size = 16 +settings( + learning_method=MomentumOptimizer(), + batch_size=batch_size, + regularization=L2Regularization(batch_size * 1e-5), + average_window=0.5, + learning_rate = 2e-3, + learning_rate_decay_a = 5e-7, + learning_rate_decay_b = 0.5, +) + +word_dim=128 +hidden_dim = 128 +with_rnn = True + +initial_std=1/math.sqrt(hidden_dim) +param_attr=ParamAttr(initial_std=initial_std) +cpu_layer_attr=ExtraLayerAttribute(device=-1) + +default_device(0) + +num_label_types=23 + +features = data_layer(name="features", size=76328) +word = data_layer(name="word", size=6778) +pos = data_layer(name="pos", size=44) +chunk = data_layer(name="chunk", + size=num_label_types, + layer_attr=cpu_layer_attr) + +emb = embedding_layer( + input=word, size=word_dim, param_attr=ParamAttr(initial_std=0)) + +hidden1 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(emb), + table_projection(pos, param_attr=param_attr)] +) + +if with_rnn: + 
rnn1 = recurrent_layer( + act=ReluActivation(), + bias_attr=True, + input=hidden1, + param_attr=ParamAttr(initial_std=0), + ) + +hidden2 = mixed_layer( + size=hidden_dim, + act=STanhActivation(), + bias_attr=True, + input=[full_matrix_projection(hidden1) + ] + ([ + full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +if with_rnn: + rnn2=recurrent_layer( + reverse=True, + act=ReluActivation(), + bias_attr=True, + input=hidden2, + param_attr=ParamAttr(initial_std=0), + ) + +crf_input = mixed_layer( + size=num_label_types, + bias_attr=False, + input=[ + full_matrix_projection(hidden2), + ] + ([ + full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0)) + ] if with_rnn else []), +) + +crf = crf_layer( + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw", initial_std=0), + layer_attr=cpu_layer_attr, +) + +crf_decoding = crf_decoding_layer( + size=num_label_types, + input=crf_input, + label=chunk, + param_attr=ParamAttr(name="crfw"), + layer_attr=cpu_layer_attr, +) + +sum_evaluator( + name="error", + input=crf_decoding, +) + +chunk_evaluator( + name="chunk_f1", + input =[crf_decoding, chunk], + chunk_scheme="IOB", + num_chunk_types=11, +) + +inputs(word, pos, chunk, features) +outputs(crf) diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh new file mode 100755 index 00000000000..9a706b98d86 --- /dev/null +++ b/demo/sequence_tagging/train.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +paddle train \ + --config rnn_crf.py \ + --parallel_nn=1 \ + --use_gpu=1 \ + --dot_period=10 \ + --log_period=1000 \ + --test_period=0 \ + --num_passes=10 diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh new file mode 100755 index 00000000000..597b5afea9c --- /dev/null +++ b/demo/sequence_tagging/train_linear.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +paddle train \ + --config linear_crf.py \ + --use_gpu=0 \ + --dot_period=100 \ + --log_period=10000 \ + --test_period=0 \ + 
--num_passes=10 diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py index 4660a6b5003..d4b947517b7 100644 --- a/python/paddle/trainer_config_helpers/optimizers.py +++ b/python/paddle/trainer_config_helpers/optimizers.py @@ -362,6 +362,13 @@ def __extends__(dict1, dict2): default_factory=lambda _: BaseRegularization()) def settings(batch_size, learning_rate=1e-3, + learning_rate_decay_a=0., + learning_rate_decay_b=0., + learning_rate_schedule='poly', + learning_rate_args='', + average_window=0, + do_average_in_cpu=False, + max_average_window=None, learning_method=None, regularization=None, is_async=False, @@ -408,10 +415,14 @@ def settings(batch_size, else: algorithm = 'owlqn' + args=['batch_size', 'learning_rate', 'learning_rate_decay_a', + 'learning_rate_decay_b', 'learning_rate_schedule', + 'learning_rate_args', 'average_window', 'do_average_in_cpu', + 'max_average_window'] kwargs = dict() - kwargs['batch_size'] = batch_size - kwargs['learning_rate'] = learning_rate kwargs['algorithm'] = algorithm + for arg in args: + kwargs[arg] = locals()[arg] kwargs = __extends__(kwargs, learning_method.to_setting_kwargs()) learning_method.extra_settings() -- GitLab