sentimental_provider.py 1.5 KB
Newer Older
Z
zhangjinchao01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
from paddle.trainer.PyDataProvider2 import *


def on_init(settings, dictionary, **kwargs):
    # on_init will invoke when data provider is initialized. The dictionary
    # is passed from trainer_config, and is a dict object with type
    # (word string => word id).

    # set input types in runtime. It will do the same thing as
    # @provider(input_types) will do, but it is set dynamically during runtime.
    settings.input_types = [
        # The text is a sequence of integer values, and each value is a word id.
        # The whole sequence is the sentences that we want to predict its
        # sentimental.
15 16
        integer_value(
            len(dictionary), seq_type=SequenceType),  # text input
Z
zhangjinchao01 已提交
17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46

        # label positive/negative
        integer_value(2)
    ]

    # save dictionary as settings.dictionary. It will be used in process
    # method.
    settings.dictionary = dictionary


@provider(init_hook=on_init)
def process(settings, filename):
    f = open(filename, 'r')

    for line in f:  # read each line of file
        label, sentence = line.split('\t')  # get label and sentence
        words = sentence.split(' ')  # get words

        # convert word string to word id
        # the word not in dictionary will be ignored.
        word_ids = []

        for each_word in words:
            if each_word in settings.dictionary:
                word_ids.append(settings.dictionary[each_word])

        # give data to paddle.
        yield word_ids, int(label)

    f.close()