reader.py 1.3 KB
Newer Older
C
caoying03 已提交
1 2
#!/usr/bin/env python
#coding=utf-8
C
caoying03 已提交
3

C
caoying03 已提交
4 5 6
import os
import random
import json
C
caoying03 已提交
7 8 9 10
import logging

# Fetch the logger registered under the name "paddle" and set its threshold
# to INFO.
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
C
caoying03 已提交
11 12


C
caoying03 已提交
13
def data_reader(data_list, is_train=True):
    """ Data reader.

    Builds a reader that yields one sample per JSON file listed in
    `data_list`.

    Arguments:
        - data_list:  A python list which contains path of training samples.
                      Every path must point to a JSON file providing the keys
                      'question', 'context', 'sent_lengths',
                      'same_as_question_word', 'ans_sentence', 'ans_start'
                      and 'ans_end'.
        - is_train:   A boolean parameter indicating this function is called
                      in training or in inferring. When true, `data_list` is
                      shuffled IN PLACE at the beginning of every pass.

    Returns:
        A zero-argument generator function. Every call starts a new pass over
        `data_list` and yields, per file, the tuple
        (question, doc, same_as_question_word, ans_sentence, ans_start,
        ans_end - ans_start), where `doc` and `same_as_question_word` are
        split into one list per sentence according to 'sent_lengths'.
    """

    def reader():
        """Yield every sample once; shuffle the data list again at the
        beginning of every pass when training."""
        if is_train:
            # NOTE: this reorders the caller's list object, not a copy.
            random.shuffle(data_list)

        for sample_path in data_list:
            # Use a context manager so the file handle is closed right after
            # parsing instead of being left to the garbage collector.
            with open(sample_path, "r") as sample_file:
                data = json.load(sample_file)

            # Wrap every entry as [[x]] once per sample; this does not depend
            # on the sentence being processed, so it is hoisted out of the
            # per-sentence loop below.
            wrapped_marks = [[[x]] for x in data['same_as_question_word']]

            start_pos = 0
            doc = []
            same_as_question_word = []
            for sent_len in data['sent_lengths']:
                end_pos = start_pos + sent_len
                doc.append(data['context'][start_pos:end_pos])
                same_as_question_word.append(wrapped_marks[start_pos:end_pos])
                start_pos = end_pos

            # The last field is the answer span length relative to its start.
            yield (data['question'], doc, same_as_question_word,
                   data['ans_sentence'], data['ans_start'],
                   data['ans_end'] - data['ans_start'])

    return reader