criteo_reader.py 1.3 KB
Newer Older
C
CandyCaneLane 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
import sys
import paddle.fluid.incubate.data_generator as dg
import pickle
from collections import Counter
import os


class CriteoDataset(dg.MultiSlotDataGenerator):
    """Data generator that parses tab-separated, Criteo-style text lines.

    Every record is expected to carry at least 40 tab-separated integer
    columns: column 0 is the label and columns 1-39 are feature indices.
    Each feature index is paired with a constant feature value of ``1.0``.
    NOTE(review): the constant ``1.0`` suggests the feature columns are
    already-encoded ids rather than raw values -- confirm against the
    preprocessing step that produces the input files.
    """

    # Column that holds the integer label.
    _LABEL_COLUMN = 0
    # Feature indices live in columns [_FEATURE_BEGIN, _FEATURE_END).
    _FEATURE_BEGIN = 1
    _FEATURE_END = 40

    def _process_line(self, line):
        """Parse one raw record.

        Args:
            line: A tab-separated record, optionally ending with a newline.

        Returns:
            A ``(feat_idx, feat_value, label)`` tuple: ``feat_idx`` is the
            list of integer feature indices, ``feat_value`` is a list of
            ``1.0`` with the same length, and ``label`` is a one-element
            list holding the integer label.

        Raises:
            IndexError: If the record has fewer than 40 columns.
            ValueError: If a consumed column is not an integer literal.
        """
        columns = line.strip('\n').split('\t')
        # Index every column explicitly instead of slicing: a truncated
        # record must raise IndexError rather than silently yield fewer
        # features.
        feat_idx = [
            int(columns[col])
            for col in range(self._FEATURE_BEGIN, self._FEATURE_END)
        ]
        feat_value = [1.0] * len(feat_idx)
        label = [int(columns[self._LABEL_COLUMN])]
        return feat_idx, feat_value, label

    def test(self, filelist):
        """Build a reader over a list of data files.

        Args:
            filelist: Iterable of file paths. Surrounding whitespace on each
                path is stripped before the file is opened.

        Returns:
            A zero-argument generator function that yields
            ``[feat_idx, feat_value, label]`` for each non-empty line of
            each file, in order.
        """

        def local_iter():
            for fname in filelist:
                with open(fname.strip(), 'r') as fin:
                    for line in fin:
                        # Skip completely empty lines (e.g. a trailing blank
                        # line); anything else is parsed and may raise.
                        if not line.strip('\r\n'):
                            continue
                        yield list(self._process_line(line))

        return local_iter

    def generate_sample(self, line):
        """Return a generator function producing the sample for ``line``.

        Args:
            line: One raw tab-separated record.

        Returns:
            A zero-argument generator function. It yields a single list of
            ``(slot_name, values)`` pairs for the slots ``'feat_idx'``,
            ``'feat_value'`` and ``'label'``, or yields nothing when
            ``line`` is empty.
        """

        def data_iter():
            # An empty line carries no sample; produce nothing instead of
            # failing inside the parser.
            if not line.strip('\r\n'):
                return
            feat_idx, feat_value, label = self._process_line(line)
            yield [('feat_idx', feat_idx), ('feat_value', feat_value),
                   ('label', label)]

        return data_iter


if __name__ == '__main__':
    # Script entry point: build the generator and let the base class's
    # run_from_stdin() drive it.
    CriteoDataset().run_from_stdin()