# map_reader.py

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import json
import paddle.fluid.incubate.data_generator as dg


class MapDataset(dg.MultiSlotDataGenerator):
    """Turn JSON-encoded lines into dense / hashed-sparse feature slots.

    Each input line is a JSON object.  ``_process_line`` reads the keys
    ``plan``, ``pid``, ``query``, ``weather``, ``label`` and the rank fields
    named in ``rank_feature_list`` / ``rank_whole_pic_list``.  The same parsed
    sample is exposed in three ways:

    * ``generate_sample`` -- named slots for the data-generator pipeline,
    * ``test_reader``     -- batched, buffered reader over files,
    * ``infer_reader``    -- batched, shuffled reader over files.

    ``setup`` must be called before any other method, because it creates all
    the attributes the other methods read.
    """

    def setup(self, sparse_feature_dim):
        """Initialise feature-name lists and hashing constants.

        Args:
            sparse_feature_dim: accepted for interface compatibility but not
                used here; the hash space is fixed by ``self.hash_dim``.
        """
        # Not read by any method of this class.
        self.profile_length = 65
        self.dense_length = 3
        # Feature names: keys of the "plan" object copied verbatim as dense
        # values.
        self.dense_feature_list = ["distance", "price", "eta"]

        # Categorical features, grouped by the JSON object they are read from.
        self.pid_list = ["pid"]
        self.query_feature_list = ["weekday", "hour", "o1", "o2", "d1", "d2"]
        self.plan_feature_list = ["transport_mode"]
        self.rank_feature_list = ["plan_rank", "whole_rank", "price_rank", "eta_rank", "distance_rank"]
        self.rank_whole_pic_list = ["mode_rank1", "mode_rank2", "mode_rank3", "mode_rank4",
                                    "mode_rank5"]
        self.weather_feature_list = ["max_temp", "min_temp", "wea", "wind"]
        # Every categorical value is hashed into [0, hash_dim).
        self.hash_dim = 1000001
        # Not read by any method of this class.
        self.train_idx_ = 2000000
        # Must contain exactly one index per categorical feature listed above
        # (1 + 6 + 1 + 5 + 5 + 4 = 22); update it whenever the lists change.
        self.categorical_range_ = range(0, 22)

    def _hash_feature(self, name, value):
        """Return the hashed slot id of feature ``name`` with value ``value``.

        NOTE: the built-in ``hash`` of a string is salted per process on
        Python 3 unless ``PYTHONHASHSEED`` is fixed, so ids are only
        reproducible across processes when that variable is set.
        """
        return hash(name + str(value)) % self.hash_dim

    # Process one instance.
    def _process_line(self, line):
        """Parse one JSON line into features and label.

        Args:
            line: a string holding one JSON object.

        Returns:
            A 4-tuple ``(dense_feature, context_feature, context_feature_fm,
            label)`` where

            * ``dense_feature`` is a list of ``self.dense_length`` values
              taken from ``instance["plan"]``,
            * ``context_feature`` is a list of one-element lists, one hashed
              id per categorical feature,
            * ``context_feature_fm`` is the same ids as one flat list,
            * ``label`` is ``[int(instance["label"])]``.

        Raises:
            ValueError: if ``line`` is not valid JSON (from ``json.loads``).
            KeyError: if an expected key is missing.
        """
        instance = json.loads(line)

        plan = instance["plan"]
        dense_feature = [0] * self.dense_length
        for idx, name in enumerate(self.dense_feature_list):
            dense_feature[idx] = plan[name]

        # An empty pid is replaced by 0 before hashing.
        if instance["pid"] == "":
            instance["pid"] = 0

        query = instance["query"]
        weather_dic = instance["weather"]

        # (feature names, object they are looked up in); the order here
        # defines the order of the sparse slots.
        feature_groups = (
            (self.pid_list, instance),
            (self.query_feature_list, query),
            (self.plan_feature_list, plan),
            (self.rank_feature_list, instance),
            (self.rank_whole_pic_list, instance),
            (self.weather_feature_list, weather_dic),
        )

        context_feature = []
        context_feature_fm = []
        for names, source in feature_groups:
            for fea in names:
                slot_id = self._hash_feature(fea, source[fea])
                context_feature.append([slot_id])
                context_feature_fm.append(slot_id)

        label = [int(instance["label"])]

        return dense_feature, context_feature, context_feature_fm, label

    def _make_file_reader(self, filelist):
        """Build a zero-argument generator function over all lines of ``filelist``.

        Each yielded sample is the flat list
        ``[dense_feature] + context_feature + [context_feature_fm] + [label]``.
        File names are stripped of surrounding whitespace before opening.
        """

        def local_iter():
            for fname in filelist:
                with open(fname.strip(), "r") as fin:
                    for line in fin:
                        dense, sparse, sparse_fm, label = self._process_line(line)
                        yield [dense] + sparse + [sparse_fm] + [label]

        return local_iter

    def infer_reader(self, filelist, batch, buf_size):
        """Return a batched reader over ``filelist`` with shuffling.

        Args:
            filelist: iterable of file paths (printed to stdout).
            batch: batch size passed to ``paddle.batch``.
            buf_size: shuffle buffer size passed to ``paddle.reader.shuffle``.
        """
        print(filelist)

        import paddle
        # NOTE(review): unlike test_reader, samples are shuffled here --
        # confirm this is intended for inference.
        batch_iter = paddle.batch(
            paddle.reader.shuffle(
                self._make_file_reader(filelist), buf_size=buf_size),
            batch_size=batch)
        return batch_iter

    # Generate inputs for testing.
    def test_reader(self, filelist, batch, buf_size):
        """Return a batched reader over ``filelist`` in file order.

        Args:
            filelist: iterable of file paths (printed to stdout).
            batch: batch size passed to ``paddle.batch``.
            buf_size: buffer size passed to ``paddle.reader.buffered``.
        """
        print(filelist)

        import paddle
        batch_iter = paddle.batch(
            paddle.reader.buffered(
                self._make_file_reader(filelist), size=buf_size),
            batch_size=batch)
        return batch_iter

    # Generate inputs for training.
    def generate_sample(self, line):
        """Return a generator function yielding the named slots for ``line``.

        The single yielded item is a list of ``(slot_name, values)`` pairs:
        ``dense_feature``, ``context<i>`` for each ``i`` in
        ``self.categorical_range_``, ``context_fm`` and ``label``.

        Raises (when the returned generator is consumed):
            ValueError: if the number of slot names does not match the number
                of slot values, i.e. ``categorical_range_`` is out of sync
                with the categorical feature lists.
        """

        def data_iter():
            dense, sparse, sparse_fm, label = self._process_line(line)

            feature_name = ["dense_feature"]
            feature_name.extend(
                "context" + str(idx) for idx in self.categorical_range_)
            feature_name.extend(["context_fm", "label"])

            values = [dense] + sparse + [sparse_fm] + [label]
            # zip() would silently truncate and misalign names and values.
            if len(feature_name) != len(values):
                raise ValueError(
                    "slot name/value count mismatch: %d names vs %d values; "
                    "check categorical_range_" %
                    (len(feature_name), len(values)))

            # Materialise the pairs: on Python 3 zip() is a lazy iterator,
            # while the data generator expects a list or tuple per sample.
            yield list(zip(feature_name, values))

        return data_iter


if __name__ == "__main__":
    # Script entry point: the first command-line argument is parsed as an
    # integer and handed to setup(); samples are then read from stdin.
    generator = MapDataset()
    sparse_dim = int(sys.argv[1])
    generator.setup(sparse_dim)
    generator.run_from_stdin()