From f976343af51de03ac6ef5a7a7ae8162c7bb0c591 Mon Sep 17 00:00:00 2001 From: wangjiawei04 Date: Wed, 10 Feb 2021 09:20:49 +0000 Subject: [PATCH] fix criteo ctr --- python/examples/criteo_ctr/README.md | 2 +- python/examples/criteo_ctr/README_CN.md | 2 +- python/examples/criteo_ctr/criteo_reader.py | 83 --------------------- python/examples/criteo_ctr/test_client.py | 58 +++++++++----- 4 files changed, 41 insertions(+), 104 deletions(-) delete mode 100644 python/examples/criteo_ctr/criteo_reader.py diff --git a/python/examples/criteo_ctr/README.md b/python/examples/criteo_ctr/README.md index 977ba56d..2e9c5c53 100644 --- a/python/examples/criteo_ctr/README.md +++ b/python/examples/criteo_ctr/README.md @@ -26,6 +26,6 @@ python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 ### RPC Infer ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/ +python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` the latency will display in the end. diff --git a/python/examples/criteo_ctr/README_CN.md b/python/examples/criteo_ctr/README_CN.md index 0baf29ff..0fd8fd5e 100644 --- a/python/examples/criteo_ctr/README_CN.md +++ b/python/examples/criteo_ctr/README_CN.md @@ -26,6 +26,6 @@ python -m paddle_serving_server_gpu.serve --model ctr_serving_model/ --port 9292 ### 执行预测 ``` -python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/ +python test_client.py ctr_client_conf/serving_client_conf.prototxt raw_data/part-0 ``` 预测完毕会输出预测过程的耗时。 diff --git a/python/examples/criteo_ctr/criteo_reader.py b/python/examples/criteo_ctr/criteo_reader.py deleted file mode 100644 index 2a80af78..00000000 --- a/python/examples/criteo_ctr/criteo_reader.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# pylint: disable=doc-string-missing - -import sys -import paddle.fluid.incubate.data_generator as dg - - -class CriteoDataset(dg.MultiSlotDataGenerator): - def setup(self, sparse_feature_dim): - self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - self.cont_max_ = [ - 20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 - ] - self.cont_diff_ = [ - 20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 - ] - self.hash_dim_ = sparse_feature_dim - # here, training data are lines with line_index < train_idx_ - self.train_idx_ = 41256555 - self.continuous_range_ = range(1, 14) - self.categorical_range_ = range(14, 40) - - def _process_line(self, line): - features = line.rstrip('\n').split('\t') - dense_feature = [] - sparse_feature = [] - for idx in self.continuous_range_: - if features[idx] == '': - dense_feature.append(0.0) - else: - dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \ - self.cont_diff_[idx - 1]) - for idx in self.categorical_range_: - sparse_feature.append( - [hash(str(idx) + features[idx]) % self.hash_dim_]) - - return dense_feature, sparse_feature, [int(features[0])] - - def infer_reader(self, filelist, batch, buf_size): - def local_iter(): - for fname in filelist: - with open(fname.strip(), "r") as fin: - for line in fin: - dense_feature, sparse_feature, label = self._process_line( - line) - #yield dense_feature, sparse_feature, label - yield [dense_feature] + sparse_feature + [label] - - import paddle - batch_iter = paddle.batch( - paddle.reader.shuffle( - local_iter, buf_size=buf_size), - batch_size=batch) - return batch_iter - - def generate_sample(self, line): - def data_iter(): - dense_feature, sparse_feature, label = self._process_line(line) - feature_name = ["dense_input"] - for idx in self.categorical_range_: - feature_name.append("C" + str(idx - 13)) - feature_name.append("label") - yield zip(feature_name, [dense_feature] + sparse_feature + [label]) - - return data_iter - - -if __name__ == "__main__": - criteo_dataset = CriteoDataset() - criteo_dataset.setup(int(sys.argv[1])) - criteo_dataset.run_from_stdin() diff --git a/python/examples/criteo_ctr/test_client.py b/python/examples/criteo_ctr/test_client.py index ecb2fc37..fd6c6e03 100644 --- a/python/examples/criteo_ctr/test_client.py +++ b/python/examples/criteo_ctr/test_client.py @@ -14,43 +14,63 @@ # pylint: disable=doc-string-missing from paddle_serving_client import Client -import paddle import sys import os import time -import criteo_reader as criteo from paddle_serving_client.metric import auc import numpy as np import sys +class CriteoReader(object): + def __init__(self, sparse_feature_dim): + self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + self.cont_max_ = [ + 20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.cont_diff_ = [ + 20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50 + ] + self.hash_dim_ = sparse_feature_dim + # here, training data are lines with line_index < train_idx_ + self.train_idx_ = 41256555 + self.continuous_range_ = range(1, 14) + self.categorical_range_ = range(14, 40) + + def process_line(self, line): + features = line.rstrip('\n').split('\t') + dense_feature = [] + sparse_feature = [] + for idx in self.continuous_range_: + if features[idx] == '': + dense_feature.append(0.0) + else: + dense_feature.append((float(features[idx]) - self.cont_min_[idx - 1]) / \ + self.cont_diff_[idx - 1]) + for idx in self.categorical_range_: + sparse_feature.append( + [hash(str(idx) + features[idx]) % self.hash_dim_]) + + return sparse_feature + py_version = sys.version_info[0] client = Client() client.load_client_config(sys.argv[1]) client.connect(["127.0.0.1:9292"]) - +reader = CriteoReader(1000001) batch = 1 buf_size = 100 -dataset = criteo.CriteoDataset() -dataset.setup(1000001) -test_filelists = [ - "{}/part-%d".format(sys.argv[2]) % x - for x in range(len(os.listdir(sys.argv[2]))) -] -reader = dataset.infer_reader(test_filelists[len(test_filelists) - 40:], batch, - buf_size) label_list = [] prob_list = [] start = time.time() -for ei in range(1000): - if py_version == 2: - data = reader().next() - else: - data = reader().__next__() +f = open(sys.argv[2], 'r') +for ei in range(10): + data = reader.process_line(f.readline()) feed_dict = {} for i in range(1, 27): - feed_dict["sparse_{}".format(i - 1)] = np.array(data[0][i]).reshape(-1) - feed_dict["sparse_{}.lod".format(i - 1)] = [0, len(data[0][i])] + feed_dict["sparse_{}".format(i - 1)] = np.array(data[i-1]).reshape(-1) + feed_dict["sparse_{}.lod".format(i - 1)] = [0, len(data[i-1])] fetch_map = client.predict(feed=feed_dict, fetch=["prob"]) + print(fetch_map) end = time.time() -print(end - start) +f.close() -- GitLab