# gen_proto_data.py

# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cStringIO import StringIO

import paddle.proto.DataFormat_pb2 as DataFormat
from google.protobuf.internal.encoder import _EncodeVarint

import logging
import pprint

# Set the root logger's message format: level, timestamp, source file and
# line number, followed by the message text.
logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
)
# Module-wide logger named 'paddle'; messages below INFO are suppressed.
logger = logging.getLogger('paddle')
logger.setLevel(logging.INFO)

# Per-column policies for out-of-vocabulary (OOV) features, i.e. features
# that are missing from the column's dictionary.
# IGNORE: the feature gets no dictionary id.
OOV_POLICY_IGNORE = 0
# USE: the feature is mapped to id 0, which is reserved for OOV.
OOV_POLICY_USE = 1
# ERROR: an unknown feature is treated as an error.
OOV_POLICY_ERROR = 2

# Number of leading columns read directly from the input file. Columns after
# these are the combined features generated from `patterns`.
num_original_columns = 3

# Feature combination patterns.
# Each pattern is a list of [offset, column] pairs, where offset is relative
# to the current timestep. The features selected by the pairs are joined with
# '/' to form one combined feature, and every pattern adds one extra column.
# For example, [[-1,0], [0,0]] means the previous token at column 0 and the
# current token at column 0 are combined into one feature.
patterns = [
    # Single features of column 0 in a window of two timesteps on each side.
    [[-2,0]],
    [[-1,0]],
    [[0,0]],
    [[1,0]],
    [[2,0]],

    # Bigrams of column 0 around the current timestep.
    [[-1,0], [0,0]],
    [[0,0], [1,0]],

    # Single features and bigrams of column 1.
    [[-2,1]],
    [[-1,1]],
    [[0,1]],
    [[1,1]],
    [[2,1]],
    [[-2,1], [-1,1]],
    [[-1,1], [0,1]],
    [[0,1], [1,1]],
    [[1,1], [2,1]],

    # Trigrams of column 1.
    [[-2,1], [-1,1], [0,1]],
    [[-1,1], [0,1], [1,1]],
    [[0,1], [1,1], [2,1]],
]

def make_features(sequence, feature_patterns=None):
    """Append combined context features to every timestep of a sequence.

    The sequence is modified in place: for each timestep, one string per
    pattern is appended to that timestep's feature list. A pattern is a list
    of (offset, column) pairs; the selected features are joined with '/'.
    Positions before the sequence start yield '#B<k>' and positions past the
    end yield '#E<k>', where k is the distance from the sequence boundary.

    Args:
        sequence: list of timesteps, each a list of feature strings.
        feature_patterns: patterns to apply. Defaults to the module-level
            `patterns` table when omitted.

    Returns:
        None. `sequence` is updated in place.
    """
    # An empty sequence (e.g. produced by consecutive blank input lines) has
    # no first row to measure and nothing to extend.
    if not sequence:
        return
    if feature_patterns is None:
        feature_patterns = patterns

    length = len(sequence)
    # Width of a timestep before any combined feature is appended; padding
    # rows need at least this many entries so column lookups succeed.
    num_features = len(sequence[0])

    def get_features(pos):
        if pos < 0:
            return ['#B%s' % -pos] * num_features
        if pos >= length:
            return ['#E%s' % (pos - length + 1)] * num_features
        return sequence[pos]

    for i, row in enumerate(sequence):
        for pattern in feature_patterns:
            row.append('/'.join(
                [get_features(i + offset)[column]
                 for offset, column in pattern]))

def create_dictionaries(filename, cutoff, oov_policy):
    """Build one feature-to-id dictionary per column from a source file.

    Source file format: each line is one timestep, with features separated
    by a single space. An empty line indicates the end of a sequence. Every
    sequence is expanded with make_features() before counting, so `cutoff`
    and `oov_policy` need one entry per expanded column.

    Args:
        filename: path of the source file.
        cutoff: list of numbers, one per column. A feature whose count in
            column i is smaller than cutoff[i] is left out of that column's
            dictionary.
        oov_policy: list of OOV_POLICY_* values, one per column. If
            oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV
            features of the i-th column and a '#OOV#' entry is added for it.

    Returns:
        A list with one dict per column, mapping feature string to id.
    """
    num_features = len(cutoff)
    dicts = [dict() for _ in range(num_features)]

    def add_to_dict(sequence):
        # Count every feature occurrence, column by column.
        for features in sequence:
            assert len(features) == num_features, \
                "Wrong number of features: " + ' '.join(features)
            for i, feature in enumerate(features):
                dicts[i][feature] = dicts[i].get(feature, 0) + 1

    def consume(sequence):
        # Consecutive blank lines produce empty sequences; skip them.
        if sequence:
            make_features(sequence)
            add_to_dict(sequence)

    # The context manager closes the file even if parsing fails.
    with open(filename, 'rb') as f:
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                consume(sequence)
                sequence = []
                continue
            sequence.append(line.split(' '))
        # The file may end without a trailing blank line; do not drop the
        # last sequence in that case.
        consume(sequence)

    for i in range(num_features):
        dct = dicts[i]
        # Ids start at 1 when id 0 is reserved for OOV features.
        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
        todo = []
        # Iterate over a snapshot: counts are overwritten with ids in place.
        for k, v in list(dct.items()):
            if v < cutoff[i]:
                todo.append(k)
            else:
                dct[k] = n
                n += 1

        if oov_policy[i] == OOV_POLICY_USE:
            # placeholder so that len(dct) will be the number of features
            # including OOV
            dct['#OOV#'] = 0

        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
        for k in todo:
            del dct[k]

    return dicts


def encode_varint(v):
    """Return the protobuf varint encoding of the integer `v`."""
    # _EncodeVarint emits the encoding through a write callback; gather the
    # emitted pieces and concatenate them.
    pieces = []
    _EncodeVarint(pieces.append, v)
    return b''.join(pieces)


def write_proto(file, message):
    """Write `message` to `file` as a varint length prefix plus payload.

    Args:
        file: object with a write() method accepting the encoded record.
        message: object providing SerializeToString().
    """
    payload = message.SerializeToString()
    # Prefix and payload go out in a single write call.
    file.write(encode_varint(len(payload)) + payload)


def gen_proto_file(
        input_file,
        dicts,
        oov_policy,
        output_file):
    """Convert a source text file into a length-prefixed protobuf data file.

    The output starts with a DataHeader describing the slots, followed by
    one DataSample per timestep. Each message is written with write_proto().
    The input uses one line per timestep with space-separated features, and
    an empty line ends a sequence.

    OOV handling for a feature of the i-th column that is missing from
    dicts[i]:
      * OOV_POLICY_USE: the feature is assigned id 0.
      * OOV_POLICY_IGNORE: an original column gets 0xffffffff in its id
        slot; a pattern column contributes nothing to the sparse vector.
      * OOV_POLICY_ERROR: all features of the column MUST exist in
        dicts[i]; a missing one is logged and raises ValueError.

    Args:
        input_file: path of the source text file.
        dicts: list of feature-to-id dicts, one per expanded column.
        oov_policy: list of OOV_POLICY_* values, one per expanded column.
        output_file: path of the binary file to write.

    Raises:
        ValueError: if an unknown feature is found in a column whose policy
            is OOV_POLICY_ERROR.
    """
    num_features = len(dicts)

    def lookup(column, token):
        # Returns the id of `token`, 0 for an OOV_POLICY_USE miss, or None
        # for an OOV_POLICY_IGNORE miss.
        feature_id = dicts[column].get(token)
        if feature_id is not None:
            return feature_id
        policy = oov_policy[column]
        if policy == OOV_POLICY_IGNORE:
            return None
        if policy == OOV_POLICY_ERROR:
            # logger.fatal only logs; raise so that a sample with a missing
            # slot is never written.
            logger.fatal("Unknown token: %s" % token)
            raise ValueError("Unknown token: %s" % token)
        return 0

    def write_sequence(out, sequence):
        is_beginning = True
        for features in sequence:
            assert len(features) == num_features, \
                "Wrong number of features: " + ' '.join(features)
            sample = DataFormat.DataSample()
            for i in range(num_original_columns):
                feature_id = lookup(i, features[i])
                # 0xffffffff is written in place of an ignored OOV id so
                # that every original column still fills its slot.
                sample.id_slots.append(
                    0xffffffff if feature_id is None else feature_id)

            if patterns:
                # All pattern columns share one sparse vector; `dim` is the
                # id offset of the current column inside that vector.
                dim = 0
                vec = sample.vector_slots.add()
                for i in range(num_original_columns, num_features):
                    feature_id = lookup(i, features[i])
                    if feature_id is not None:
                        vec.ids.append(dim + feature_id)
                    dim += len(dicts[i])

            # Only the first timestep of a sequence is flagged.
            sample.is_beginning = is_beginning
            is_beginning = False
            write_proto(out, sample)

    # Context managers close both files even when conversion fails.
    with open(input_file, 'rb') as f, open(output_file, 'wb') as out:
        header = DataFormat.DataHeader()
        if patterns:
            slot_def = header.slot_defs.add()
            slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
            slot_def.dim = sum([len(dicts[i])
                                for i in range(num_original_columns,
                                               num_features)])
            logger.info("feature_dim=%s" % slot_def.dim)

        for i in range(num_original_columns):
            slot_def = header.slot_defs.add()
            slot_def.type = DataFormat.SlotDef.INDEX
            slot_def.dim = len(dicts[i])

        write_proto(out, header)

        num_sequences = 0
        sequence = []
        for line in f:
            line = line.strip()
            if not line:
                # Consecutive blank lines produce empty sequences; skip them.
                if sequence:
                    make_features(sequence)
                    write_sequence(out, sequence)
                    num_sequences += 1
                sequence = []
                continue
            sequence.append(line.split(' '))

        # The file may end without a trailing blank line; do not drop the
        # last sequence in that case.
        if sequence:
            make_features(sequence)
            write_sequence(out, sequence)
            num_sequences += 1

    logger.info("num_sequences=%s" % num_sequences)

# Fixed label-to-id table: every chunk type gets a 'B-' id followed by an
# 'I-' id, in the listed order, and 'O' takes the last id.
dict2 = dict(
    ('%s-%s' % (prefix, chunk), 2 * index + offset)
    for index, chunk in enumerate([
        'ADJP', 'ADVP', 'CONJP', 'INTJ', 'LST', 'NP',
        'PP', 'PRT', 'SBAR', 'UCP', 'VP',
    ])
    for offset, prefix in enumerate('BI'))
dict2['O'] = len(dict2)

if __name__ == '__main__':
    # Build the dictionaries from the training text, then convert both the
    # training and the test text with those same dictionaries.
    num_patterns = len(patterns)
    cutoff = [3, 1, 0] + [3] * num_patterns
    oov_policy = (
        [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR] +
        [OOV_POLICY_IGNORE] * num_patterns)

    train_text = 'trainer/tests/train.txt'
    dicts = create_dictionaries(train_text, cutoff, oov_policy)
    # Column 2 uses the fixed label table instead of the counted dictionary.
    dicts[2] = dict2

    conversions = (
        (train_text, 'trainer/tests/train_proto.bin'),
        ('trainer/tests/test.txt', 'trainer/tests/test_proto.bin'),
    )
    for text_path, proto_path in conversions:
        gen_proto_file(text_path, dicts, oov_policy, proto_path)