# reader.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Reader for auto dialogue evaluation"""

import sys
import time
import random
import numpy as np

import paddle
import paddle.fluid as fluid


class DataProcessor(object):
    """Load tab-separated dialogue examples and serve them in batches.

    Every line of the data file is expected to contain three tab-separated
    fields, ``context \\t response \\t label``. ``context`` and ``response``
    are whitespace-separated integer ids and ``label`` is a single integer.
    """

    def __init__(self, data_path, max_seq_length, batch_size):
        """Store the reader configuration.

        Args:
            data_path: path of the data file to read.
            max_seq_length: maximum number of ids kept for each context and
                each response; longer sequences are truncated.
            batch_size: number of examples per yielded batch.
        """
        self.data_file = data_path
        self.max_seq_len = max_seq_length
        self.batch_size = batch_size
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}

    def get_examples(self):
        """Read the data file and return its lines with whitespace stripped.

        A progress message is printed every 100 lines.

        Returns:
            list of str, one entry per line of the file (unparsed).
        """
        examples = []
        with open(self.data_file, 'r') as fr:
            for index, line in enumerate(fr):
                if index != 0 and index % 100 == 0:
                    print("processing data: %d" % index)
                examples.append(line.strip())
        return examples

    def get_num_examples(self, phase):
        """Count the lines of the data file and record them for ``phase``.

        Args:
            phase: one of 'train', 'dev' or 'test'.

        Returns:
            int, the number of lines in the data file.

        Raises:
            ValueError: if ``phase`` is not a known phase.
        """
        if phase not in ['train', 'dev', 'test']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        # Plain 'r' already uses universal newlines on Python 3; the legacy
        # 'rU' mode was removed in Python 3.11. The context manager also
        # guarantees the handle is closed.
        with open(self.data_file, 'r') as fr:
            count = sum(1 for _ in fr)
        self.num_examples[phase] = count
        return self.num_examples[phase]

    def data_generator(self,
                       place,
                       phase="train",
                       shuffle=True,
                       sample_pro=1):
        """
        Generate data for train, dev or test.

        The examples are loaded (and optionally shuffled) once, when this
        method is called; every call of the returned generator function
        iterates over that same list.

        Args:
            place: passed to ``LoDTensor.set`` as the target of the tensor
                data (presumably a fluid CPUPlace/CUDAPlace; confirm with
                the caller).
            phase: string. The phase for which to generate data. Currently
                unused by this method; kept for interface compatibility.
            shuffle: bool. Whether to shuffle examples.
            sample_pro: sample data ratio. When below 1, each example is
                kept with this probability.

        Returns:
            A zero-argument generator function. Each yielded item is a list
            ``[context, response, labels]`` where ``context`` and
            ``response`` are LoDTensors of int64 ids and ``labels`` is an
            int64 numpy array of shape ``[batch, 1]``.
        """
        examples = self.get_examples()
        if shuffle:
            np.random.shuffle(examples)

        def batch_reader():
            """Parse examples and yield them as lists of at most batch_size.

            Raises:
                ValueError: if an id or label field is not an integer.
            """
            batch = []
            for example in examples:
                # Random down-sampling: drop the example with probability
                # (1 - sample_pro).
                if sample_pro < 1 and random.random() > sample_pro:
                    continue
                tokens = example.strip().split('\t')

                if len(tokens) != 3:
                    # Malformed lines are reported and skipped, not fatal.
                    print("data format error: %s" % example.strip())
                    print("please input data: context \t response \t label")
                    continue

                context = [
                    int(x) for x in tokens[0].split()[:self.max_seq_len]
                ]
                response = [
                    int(x) for x in tokens[1].split()[:self.max_seq_len]
                ]
                label = [int(tokens[2])]

                batch.append((context, response, label))
                if len(batch) == self.batch_size:
                    yield batch
                    batch = []

            # Flush the final, possibly smaller, batch.
            if batch:
                yield batch

        def create_lodtensor(data_ids, place):
            """create LodTensor for input ids"""
            # Level-of-detail offsets: cumulative sequence lengths starting
            # at 0, so sequence i occupies rows lod[i]:lod[i + 1].
            lod = [0]
            for ids in data_ids:
                lod.append(lod[-1] + len(ids))
            flattened_data = np.concatenate(data_ids, axis=0).astype("int64")
            flattened_data = flattened_data.reshape([len(flattened_data), 1])
            res = fluid.LoDTensor()
            res.set(flattened_data, place)
            res.set_lod([lod])
            return res

        def wrapper():
            """yield batch data to network"""
            for batch_data in batch_reader():
                context_ids = [inst[0] for inst in batch_data]
                response_ids = [inst[1] for inst in batch_data]
                label_ids = [inst[2] for inst in batch_data]
                context_res = create_lodtensor(context_ids, place)
                response_res = create_lodtensor(response_ids, place)
                label_ids = np.array(label_ids).astype("int64").reshape(
                    [-1, 1])
                yield [context_res, response_res, label_ids]

        return wrapper