# dataprovider.py

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from paddle.trainer.PyDataProvider2 import *


# Define a data provider for "query relationship": it yields pairs of
# documents from the same query together with a pairwise preference label.
@provider(
    input_types={
        'features1': dense_vector(46),
        'features2': dense_vector(46),
        'label': dense_vector(1)
    },
    should_shuffle=False,
    cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, file_name):
    """Yield pairwise training samples read from a ranking feature file.

    Each usable line is whitespace separated and is read as::

        <label> qid:<query id> <idx>:<value> ... <idx>:<value> [# comment]

    Everything after the first ``#`` is ignored. Token 0 is the integer
    relevance label, token 1 carries the integer query id after the colon,
    and tokens 2..47 carry the 46 feature values after their colons. Lines
    with fewer than 48 tokens (blank, comment-only or truncated) are skipped.

    Samples are produced in file order:

    * For the first document of a query, the document is paired with itself
      and the label is ``[0.5]``.
    * For every following document of the same query, the previous document
      of that query is paired with the current one. The label is ``[1]``
      when the previous document has the larger relevance label, ``[0]``
      when it has the smaller one and ``[0.5]`` on a tie.

    Args:
        settings: Provider settings object; not used by this function.
        file_name: Path of the text file to read.

    Yields:
        A tuple ``(features1, features2, [p12])`` where both feature entries
        are lists of 46 floats.

    Raises:
        IOError/OSError: If ``file_name`` cannot be opened.
        ValueError, IndexError: If a line with at least 48 tokens does not
            follow the ``key:value`` layout described above.
    """
    first_feature = 2  # tokens 0 and 1 are the label and the query id
    min_tokens = 48  # 2 leading tokens + 46 features

    with open(file_name) as f:
        pre_qid = -1
        prev_feats = None
        prev_label = 0
        for line in f:
            # Strip the trailing "# ..." comment and tokenize the row once.
            tokens = line.split('#')[0].split()
            if len(tokens) < min_tokens:
                continue

            qid = int(tokens[1].split(':')[1])
            feats = [
                float(term.split(':')[1])
                for term in tokens[first_feature:min_tokens]
            ]
            label = int(tokens[0])

            if pre_qid != qid:
                # First document of a new query: emit a neutral self-pair.
                pre_qid = qid
                yield feats, feats, [0.5]
            else:
                # Compare against the previous document of the SAME query.
                # prev_label is refreshed on every accepted line (including
                # the first line of a query), so it can never leak over from
                # an earlier query.
                p12 = 0.5
                if prev_label > label:
                    p12 = 1
                if prev_label < label:
                    p12 = 0
                yield prev_feats, feats, [p12]

            prev_feats = feats
            prev_label = label


@provider(input_types={'features': dense_vector(46)})
def process_predict(settings, file_name):
    """Yield one 46-value feature vector per usable line of a feature file.

    Lines are whitespace separated; everything after the first ``#`` is
    ignored. Tokens 0 and 1 are not read, and tokens 2..47 are read as
    ``<idx>:<value>`` pairs whose values become the features.

    Lines with fewer than 48 tokens (blank, comment-only or truncated) are
    skipped, the same rule ``process`` applies, so that every yielded vector
    has exactly 46 entries.

    Args:
        settings: Provider settings object; not used by this function.
        file_name: Path of the text file to read.

    Yields:
        A list of 46 floats for each accepted line, in file order.

    Raises:
        IOError/OSError: If ``file_name`` cannot be opened.
        ValueError, IndexError: If a feature token of an accepted line is
            not of the form ``<idx>:<number>``.
    """
    first_feature = 2  # tokens 0 and 1 precede the features
    min_tokens = 48  # 2 leading tokens + 46 features

    with open(file_name) as f:
        for line in f:
            # Strip the trailing "# ..." comment and tokenize the row once.
            tokens = line.split('#')[0].split()
            if len(tokens) < min_tokens:
                # Would otherwise yield an empty or short vector.
                continue
            yield [
                float(term.split(':')[1])
                for term in tokens[first_feature:min_tokens]
            ]