fix

58923013 · xujiaqi01 · c3b28513 · 58923013 · 58923013 · 58923013
20 changed files
--- a/models/rank/dcn/config.yaml
+++ b/models/rank/dcn/config.yaml
@@ -22,7 +22,7 @@ train:

  reader:
    batch_size: 2
-    train_data_path: "{workspace}/slot_data/train"
+    train_data_path: "{workspace}/data/slot_train"
    feat_dict_name: "{workspace}/data/vocab"
    sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
    dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"
@@ -35,7 +35,7 @@ train:
      l2_reg_cross: 0.00005
      dnn_use_bn: False
      clip_by_norm: 100.0
-      cat_feat_num: "{workspace}/slot_data/cat_feature_num.txt"
+      cat_feat_num: "{workspace}/data/cat_feature_num.txt"
      is_sparse: False
      is_test: False
      num_field: 39

--- a/models/rank/dcn/data/get_slot_data.py
+++ b/models/rank/dcn/data/get_slot_data.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import sys
+import yaml
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+from collections import Counter
+import os
+import paddle.fluid.incubate.data_generator as dg
+
+class TrainReader(dg.MultiSlotDataGenerator):
+
+    def __init__(self, config):
+        dg.MultiSlotDataGenerator.__init__(self)
+
+        if os.path.isfile(config):
+            with open(config, 'r') as rb:
+                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+        else:
+            raise ValueError("reader config only support yaml")
+
+    def init(self):
+        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.cont_max_ = [
+            5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 11,
+            231, 4008, 7393
+        ]
+        self.cont_diff_ = [
+            self.cont_max_[i] - self.cont_min_[i]
+            for i in range(len(self.cont_min_))
+        ]
+        self.cont_idx_ = list(range(1, 14))
+        self.cat_idx_ = list(range(14, 40))
+
+        dense_feat_names = ['I' + str(i) for i in range(1, 14)]
+        sparse_feat_names = ['C' + str(i) for i in range(1, 27)]
+        target = ['label']
+
+        self.label_feat_names = target + dense_feat_names + sparse_feat_names
+
+        self.cat_feat_idx_dict_list = [{} for _ in range(26)]
+        
+        # TODO: set vocabulary dictionary
+        vocab_dir = "./vocab/"
+        for i in range(26):
+            lookup_idx = 1  # remain 0 for default value
+            for line in open(
+                    os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')):
+                self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx
+                lookup_idx += 1 
+
+    def _process_line(self, line):
+        features = line.rstrip('\n').split('\t')
+        label_feat_list = [[] for _ in range(40)]
+        for idx in self.cont_idx_:
+            if features[idx] == '':
+                label_feat_list[idx].append(0)
+            else:
+                # 0-1 minmax norm
+                # label_feat_list[idx].append((float(features[idx]) - self.cont_min_[idx - 1]) /
+                #                             self.cont_diff_[idx - 1])
+                # log transform
+                label_feat_list[idx].append(
+                    math.log(4 + float(features[idx]))
+                    if idx == 2 else math.log(1 + float(features[idx])))
+        for idx in self.cat_idx_:
+            if features[idx] == '' or features[
+                    idx] not in self.cat_feat_idx_dict_list[idx - 14]:
+                label_feat_list[idx].append(0)
+            else:
+                label_feat_list[idx].append(self.cat_feat_idx_dict_list[
+                    idx - 14][features[idx]])
+        label_feat_list[0].append(int(features[0]))
+        return label_feat_list
+    
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+        def data_iter():
+            label_feat_list = self._process_line(line)
+            s = ""
+            for i in list(zip(self.label_feat_names, label_feat_list)):
+                k = i[0]
+                v = i[1]
+                for j in v:
+                    s += " " + k + ":" + str(j)
+            print s.strip()
+            yield None
+
+        return data_iter
+
+# Script entry point: build the reader from the model config one directory
+# up, load the vocabularies, then convert the samples piped in on stdin
+# (run.sh in this directory feeds each raw file through this script).
+reader = TrainReader("../config.yaml")
+reader.init()
+reader.run_from_stdin()
--- a/models/rank/dcn/data/run.sh
+++ b/models/rank/dcn/data/run.sh
+python download.py
+python preprocess.py
+
+# Convert every raw file under ./train and ./test_valid to slot format.
+# `mkdir -p` keeps the script re-runnable, and the quoted glob copes with
+# file names containing whitespace, which parsing `ls` output does not.
+for split in train test_valid
+do
+    mkdir -p "slot_$split"
+    for f in "./$split"/*
+    do
+        # An empty directory leaves the pattern unexpanded; skip it.
+        [ -e "$f" ] || continue
+        python get_slot_data.py < "$f" > "slot_$split/$(basename "$f")"
+    done
+done
--- a/models/rank/deepfm/config.yaml
+++ b/models/rank/deepfm/config.yaml
@@ -22,8 +22,8 @@ train:

  reader:
    batch_size: 2
-    train_data_path: "{workspace}/slot_data/train_data"
-    feat_dict_name: "{workspace}/slot_data/feat_dict_10.pkl2"
+    train_data_path: "{workspace}/data/slot_train_data"
+    feat_dict_name: "{workspace}/data/feat_dict_10.pkl2"
    sparse_slots: "label feat_idx"
    dense_slots: "feat_value:39"


--- a/models/rank/deepfm/data/get_slot_data.py
+++ b/models/rank/deepfm/data/get_slot_data.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+class TrainReader(dg.MultiSlotDataGenerator):
+
+    def __init__(self, config):
+        dg.MultiSlotDataGenerator.__init__(self)
+
+        if os.path.isfile(config):
+            with open(config, 'r') as rb:
+                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+        else:
+            raise ValueError("reader config only support yaml")
+
+    def init(self):
+        self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        self.cont_max_ = [
+            5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46,
+            231, 4008, 7393
+        ]
+        self.cont_diff_ = [
+            self.cont_max_[i] - self.cont_min_[i]
+            for i in range(len(self.cont_min_))
+        ]
+        self.continuous_range_ = range(1, 14)
+        self.categorical_range_ = range(14, 40)
+        # load preprocessed feature dict 
+        self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
+        self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb')) 
+
+    def _process_line(self, line):
+        features = line.rstrip('\n').split('\t')
+        feat_idx = []
+        feat_value = []
+        for idx in self.continuous_range_:
+            if features[idx] == '':
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[idx])
+                feat_value.append(
+                    (float(features[idx]) - self.cont_min_[idx - 1]) /
+                    self.cont_diff_[idx - 1])
+        for idx in self.categorical_range_:
+            if features[idx] == '' or features[idx] not in self.feat_dict_:
+                feat_idx.append(0)
+                feat_value.append(0.0)
+            else:
+                feat_idx.append(self.feat_dict_[features[idx]])
+                feat_value.append(1.0)
+        label = [int(features[0])]
+        return feat_idx, feat_value, label
+    
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+        def data_iter():
+            feat_idx, feat_value, label = self._process_line(line)
+            s = ""
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
+                k = i[0]
+                v = i[1]
+                for j in v:
+                    s += " " + k + ":" + str(j)
+            print s.strip()
+            yield None
+        return data_iter
+
+# Script entry point: build the reader from the model config one directory
+# up, load the feature dictionary, then convert the samples piped in on
+# stdin (run.sh in this directory feeds each raw file through this script).
+reader = TrainReader("../config.yaml")
+reader.init()
+reader.run_from_stdin()
--- a/models/rank/deepfm/data/run.sh
+++ b/models/rank/deepfm/data/run.sh
+python download_preprocess.py
+
+# Convert every raw file under ./train_data and ./test_data to slot format.
+# `mkdir -p` keeps the script re-runnable, and the quoted glob copes with
+# file names containing whitespace, which parsing `ls` output does not.
+for split in train_data test_data
+do
+    mkdir -p "slot_$split"
+    for f in "./$split"/*
+    do
+        # An empty directory leaves the pattern unexpanded; skip it.
+        [ -e "$f" ] || continue
+        python get_slot_data.py < "$f" > "slot_$split/$(basename "$f")"
+    done
+done
--- a/models/rank/dnn/config.yaml
+++ b/models/rank/dnn/config.yaml
@@ -22,7 +22,7 @@ train:

  reader:
    batch_size: 2
-    train_data_path: "{workspace}/slot_data/train"
+    train_data_path: "{workspace}/data/slot_train_data"
    reader_debug_mode: False
    sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
    dense_slots: "dense_var:13"

--- a/models/rank/dnn/data/download.sh
+++ b/models/rank/dnn/data/download.sh
+wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
+tar -zxvf ctr_data.tar.gz
+mv ./raw_data ./train_data_full
+mkdir train_data && cd train_data
+cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..
+mv ./test_data ./test_data_full
+mkdir test_data && cd test_data
+cp ../test_data_full/part-220 ./  && cd ..
+echo "Complete data download."
+echo "Full Train data stored in ./train_data_full "
+echo "Full Test data stored in ./test_data_full "
+echo "Rapid Verification train data stored in ./train_data "
+echo "Rapid Verification test data stored in ./test_data "
--- a/models/rank/dnn/data/get_slot_data.py
+++ b/models/rank/dnn/data/get_slot_data.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid.incubate.data_generator as dg
+
+# Lower bound of each of the 13 continuous columns, indexed by column - 1.
+cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+# Upper bounds of the same columns (not referenced by the code shown below).
+cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
+# Normalisation denominators; each entry equals cont_max_ - cont_min_.
+cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
+# Modulus applied to the hash of every categorical value.
+hash_dim_ = 1000001
+# Positions of the continuous / categorical columns in a tab-split input
+# line; position 0 is the label.
+continuous_range_ = range(1, 14)
+categorical_range_ = range(14, 40)
+
+
+class CriteoDataset(dg.MultiSlotDataGenerator):
+    """
+    DacDataset: inheritance MultiSlotDataGeneratior, Implement data reading
+    Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
+    """
+
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+        def reader():
+            """
+            This function needs to be implemented by the user, based on data format
+            """
+            features = line.rstrip('\n').split('\t')
+            dense_feature = []
+            sparse_feature = []
+            for idx in continuous_range_:
+                if features[idx] == "":
+                    dense_feature.append(0.0)
+                else:
+                    dense_feature.append(
+                        (float(features[idx]) - cont_min_[idx - 1]) /
+                        cont_diff_[idx - 1])
+            for idx in categorical_range_:
+                sparse_feature.append(
+                    [hash(str(idx) + features[idx]) % hash_dim_])
+            label = [int(features[0])]
+            process_line = dense_feature, sparse_feature, label
+            feature_name = ["dense_feature"]
+            for idx in categorical_range_:
+                feature_name.append("C" + str(idx - 13))
+            feature_name.append("label")
+            s = "click:" + str(label[0])
+            for i in dense_feature:
+                s += " dense_feature:" + str(i) 
+            for i in range(1, 1 + len(categorical_range_)):
+                s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
+            print s.strip()
+            yield None
+        return reader
+
+
+# Script entry point: convert the samples piped in on stdin (run.sh in this
+# directory feeds each raw file through this script).
+d = CriteoDataset()
+d.run_from_stdin()
--- a/models/rank/dnn/data/run.sh
+++ b/models/rank/dnn/data/run.sh
+sh download.sh
+
+# Convert every raw file of the full and the rapid-verification data sets to
+# slot format. `mkdir -p` keeps the script re-runnable, and the quoted glob
+# copes with file names containing whitespace, which parsing `ls` output
+# does not.
+for split in train_data_full test_data_full train_data test_data
+do
+    mkdir -p "slot_$split"
+    for f in "./$split"/*
+    do
+        # An empty directory leaves the pattern unexpanded; skip it.
+        [ -e "$f" ] || continue
+        python get_slot_data.py < "$f" > "slot_$split/$(basename "$f")"
+    done
+done
--- a/models/rank/dnn/data/test/sample_test.txt
+++ b/models/rank/dnn/data/test/sample_test.txt
--- a/models/rank/dnn/data/train/sample_train.txt
+++ b/models/rank/dnn/data/train/sample_train.txt
--- a/models/rank/wide_deep/config.yaml
+++ b/models/rank/wide_deep/config.yaml
@@ -22,7 +22,7 @@ train:

  reader:
    batch_size: 2
-    train_data_path: "{workspace}/slot_data/train_data"
+    train_data_path: "{workspace}/data/slot_train_data"
    sparse_slots: "label"
    dense_slots: "wide_input:8 deep_input:58"


--- a/models/rank/wide_deep/create_data.sh
+++ b/models/rank/wide_deep/create_data.sh
 mkdir train_data
 mkdir test_data
-mkdir data
-train_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.data"
-test_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.test"
-train_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/train_data/train_data.csv"
-test_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/test_data/test_data.csv"
+train_path="adult.data"
+test_path="adult.test"
+train_data_path="./train_data/train_data.csv"
+test_data_path="./test_data/test_data.csv"

-#pip install -r requirements.txt
+pip install -r requirements.txt

-#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
-#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
+wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test

 python data_preparation.py --train_path ${train_path} \
                           --test_path ${test_path} \

--- a/models/rank/wide_deep/data/data_preparation.py
+++ b/models/rank/wide_deep/data/data_preparation.py
+import os
+import io
+import args
+import pandas as pd
+from sklearn import  preprocessing
+
+def _clean_file(source_path,target_path):
+    """makes changes to match the CSV format."""
+    with io.open(source_path, 'r') as temp_eval_file:
+        with io.open(target_path, 'w') as eval_file:
+            for line in temp_eval_file:
+                line = line.strip()
+                line = line.replace(', ', ',')
+                if not line or ',' not in line:
+                    continue
+                if line[-1] == '.':
+                    line = line[:-1]
+                line += '\n'
+                eval_file.write(line)
+                    
+def build_model_columns(train_data_path, test_data_path):
+    # The column names are from
+    # https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
+    column_names = [
+    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
+    'marital_status', 'occupation', 'relationship', 'race', 'gender',
+    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
+    'income_bracket'
+    ]
+
+    # Load the dataset in Pandas
+    train_df = pd.read_csv(
+        train_data_path,
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names)
+    test_df = pd.read_csv(
+        test_data_path,
+        delimiter=',',
+        header=None,
+        index_col=None,
+        names=column_names)
+
+    # First group of tasks according to the paper
+    #label_columns = ['income_50k', 'marital_stat']
+    categorical_columns = ['education','marital_status','relationship','workclass','occupation']
+    for col in categorical_columns:
+        label_train = preprocessing.LabelEncoder()
+        train_df[col]= label_train.fit_transform(train_df[col])
+        label_test = preprocessing.LabelEncoder()
+        test_df[col]= label_test.fit_transform(test_df[col])
+    
+    bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]  
+    train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
+    test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
+    
+    base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
+    
+    train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)    
+    test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
+    train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
+    test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
+    crossed_columns = ['education_occupation','age_buckets_education_occupation']
+    
+    for col in crossed_columns:
+        label_train = preprocessing.LabelEncoder()
+        train_df[col]= label_train.fit_transform(train_df[col])
+        label_test = preprocessing.LabelEncoder()
+        test_df[col]= label_test.fit_transform(test_df[col])
+        
+    wide_columns = base_columns + crossed_columns
+    
+    train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
+    test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
+    train_df = train_df.join(train_df_temp)
+    test_df = test_df.join(test_df_temp)
+    
+    deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
+    
+    train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
+    test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
+    
+    with io.open('train_data/columns.txt','w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+        f.write(write_str)
+        f.close()
+    with io.open('test_data/columns.txt','w') as f:
+        write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
+        f.write(write_str)
+        f.close()
+    
+    train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
+    test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
+
+
+def clean_file(train_path, test_path, train_data_path, test_data_path):
+    _clean_file(train_path, train_data_path)
+    _clean_file(test_path, test_data_path)
+
+if __name__ == '__main__':
+    # NOTE: this rebinds `args` from the imported `args` module to the
+    # parsed command-line arguments for the rest of the script.
+    args = args.parse_args()
+    # Clean the raw files first; build_model_columns then reads the cleaned
+    # CSVs back and overwrites them with the final feature columns.
+    clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)  
+    build_model_columns(args.train_data_path, args.test_data_path)
--- a/models/rank/wide_deep/data/get_slot_data.py
+++ b/models/rank/wide_deep/data/get_slot_data.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import yaml
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+import paddle.fluid.incubate.data_generator as dg
+
+class TrainReader(dg.MultiSlotDataGenerator):
+    def __init__(self, config):
+        dg.MultiSlotDataGenerator.__init__(self)
+
+        if os.path.isfile(config):
+            with open(config, 'r') as rb:
+                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+        else:
+            raise ValueError("reader config only support yaml")
+
+    def init(self):
+        pass
+
+    def _process_line(self, line):
+        line = line.strip().split(',')
+        features = list(map(float, line))
+        wide_feat = features[0:8]
+        deep_feat = features[8:58+8]
+        label = int(features[-1])
+        return wide_feat, deep_feat, [label]
+    
+    def generate_sample(self, line):
+        """
+        Read the data line by line and process it as a dictionary
+        """
+        def data_iter():
+            wide_feat, deep_deat, label = self._process_line(line)
+
+            s = ""
+            for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
+                k = i[0]
+                v = i[1]
+                for j in v:
+                    s += " " + k + ":" + str(j)
+            print s.strip()
+            yield None
+
+        return data_iter
+
+# Script entry point: build the reader from the model config one directory
+# up, then convert the samples piped in on stdin (run.sh in this directory
+# feeds each file through this script).
+reader = TrainReader("../config.yaml")
+reader.init()
+reader.run_from_stdin()
--- a/models/rank/wide_deep/data/run.sh
+++ b/models/rank/wide_deep/data/run.sh
+sh create_data.sh
+
+mkdir slot_train_data
+for i in `ls ./train_data`
+do
+    cat train_data/$i | python get_slot_data.py > slot_train_data/$i
+done
+
+mkdir slot_test_data
+for i in `ls ./test_data`
+do
+    cat test_data/$i | python get_slot_data.py > slot_test_data/$i
+done
--- a/models/rank/xdeepfm/config.yaml
+++ b/models/rank/xdeepfm/config.yaml
@@ -22,7 +22,7 @@ train:

  reader:
    batch_size: 2
-    train_data_path: "{workspace}/slot_data/train_data"
+    train_data_path: "{workspace}/data/slot_train_data"
    sparse_slots: "label feat_idx"
    dense_slots: "feat_value:39"


--- a/models/rank/xdeepfm/data/get_slot_data.py
+++ b/models/rank/xdeepfm/data/get_slot_data.py
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import yaml
+from paddlerec.core.reader import Reader
+from paddlerec.core.utils import envs
+try:
+    import cPickle as pickle
+except ImportError:
+    import pickle
+import paddle.fluid.incubate.data_generator as dg
+
+class TrainReader(dg.MultiSlotDataGenerator):
+    def __init__(self, config):
+        dg.MultiSlotDataGenerator.__init__(self)
+        if os.path.isfile(config):
+            with open(config, 'r') as rb:
+                _config = yaml.load(rb.read(), Loader=yaml.FullLoader)
+        else:
+            raise ValueError("reader config only support yaml")
+
+    def init(self):
+        pass
+    
+    def _process_line(self, line):
+        features = line.strip('\n').split('\t')
+        feat_idx = []
+        feat_value = []
+        for idx in range(1, 40):
+            feat_idx.append(int(features[idx]))
+            feat_value.append(1.0)
+        label = [int(features[0])]
+        return feat_idx, feat_value, label
+    
+    def generate_sample(self, line):
+        def data_iter():
+            feat_idx, feat_value, label = self._process_line(line)
+
+            s = ""
+            for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
+                k = i[0]
+                v = i[1]
+                for j in v:
+                    s += " " + k + ":" + str(j)
+            print s.strip()
+            yield None
+
+        return data_iter
+
+# Script entry point: build the reader from the model config one directory
+# up, then convert the samples piped in on stdin (run.sh in this directory
+# feeds each raw file through this script).
+reader = TrainReader("../config.yaml")
+reader.init()
+reader.run_from_stdin()
--- a/models/rank/xdeepfm/data/run.sh
+++ b/models/rank/xdeepfm/data/run.sh
+python download.py
+
+mkdir -p slot_train_data/tr
+for i in `ls ./train_data/tr`
+do
+    cat train_data/tr/$i | python get_slot_data.py > slot_train_data/tr/$i
+done
+
+mkdir slot_test_data/ev
+for i in `ls ./test_data/ev`
+do
+    cat test_data/ev/$i | python get_slot_data.py > slot_test_data/ev/$i
+done