diff --git a/models/rank/criteo_reader.py b/models/rank/criteo_reader.py deleted file mode 100755 index 75994fb43f6ee3a72ab2aae25c36e0591c530fee..0000000000000000000000000000000000000000 --- a/models/rank/criteo_reader.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -from paddlerec.core.reader import Reader -from paddlerec.core.utils import envs - - -class TrainReader(Reader): - def init(self): - self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - self.cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] - self.cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50] - self.hash_dim_ = envs.get_global_env("hyper_parameters.sparse_feature_number", None, "train.model") - self.continuous_range_ = range(1, 14) - self.categorical_range_ = range(14, 40) - - def generate_sample(self, line): - """ - Read the data line by line and process it as a dictionary - """ - - def reader(): - """ - This function needs to be implemented by the user, based on data format - """ - features = line.rstrip('\n').split('\t') - - dense_feature = [] - sparse_feature = [] - for idx in self.continuous_range_: - if features[idx] == "": - dense_feature.append(0.0) - else: - dense_feature.append( - (float(features[idx]) - self.cont_min_[idx - 1]) / - self.cont_diff_[idx - 1]) - - for idx in self.categorical_range_: - sparse_feature.append( - [hash(str(idx) + features[idx]) % self.hash_dim_]) - label = [int(features[0])] - feature_name = ["D"] - for idx in self.categorical_range_: - feature_name.append("S" + str(idx - 13)) - feature_name.append("label") - yield zip(feature_name, [dense_feature] + sparse_feature + [label]) - - return reader diff --git a/models/rank/readme.md b/models/rank/readme.md index 31b08d36a8fbd82a668d63ff21b2e9c77179c000..bbcf32ea1fd60c372b0496bd893b89dcabdf5e8d 100755 --- a/models/rank/readme.md +++ b/models/rank/readme.md @@ -64,6 +64,8 @@ sh run.sh ``` +数据读取默认使用core/reader.py + ### 训练 ``` python -m paddlerec.run -m paddlerec.models.rank.dnn # 以DNN为例