提交 58923013 编写于 作者: X xujiaqi01

fix

上级 c3b28513
...@@ -22,7 +22,7 @@ train: ...@@ -22,7 +22,7 @@ train:
reader: reader:
batch_size: 2 batch_size: 2
train_data_path: "{workspace}/slot_data/train" train_data_path: "{workspace}/data/slot_train"
feat_dict_name: "{workspace}/data/vocab" feat_dict_name: "{workspace}/data/vocab"
sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26" sparse_slots: "label C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26"
dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1" dense_slots: "I1:1 I2:1 I3:1 I4:1 I5:1 I6:1 I7:1 I8:1 I9:1 I10:1 I11:1 I12:1 I13:1"
...@@ -35,7 +35,7 @@ train: ...@@ -35,7 +35,7 @@ train:
l2_reg_cross: 0.00005 l2_reg_cross: 0.00005
dnn_use_bn: False dnn_use_bn: False
clip_by_norm: 100.0 clip_by_norm: 100.0
cat_feat_num: "{workspace}/slot_data/cat_feature_num.txt" cat_feat_num: "{workspace}/data/cat_feature_num.txt"
is_sparse: False is_sparse: False
is_test: False is_test: False
num_field: 39 num_field: 39
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
from collections import Counter
import os
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 11,
231, 4008, 7393
]
self.cont_diff_ = [
self.cont_max_[i] - self.cont_min_[i]
for i in range(len(self.cont_min_))
]
self.cont_idx_ = list(range(1, 14))
self.cat_idx_ = list(range(14, 40))
dense_feat_names = ['I' + str(i) for i in range(1, 14)]
sparse_feat_names = ['C' + str(i) for i in range(1, 27)]
target = ['label']
self.label_feat_names = target + dense_feat_names + sparse_feat_names
self.cat_feat_idx_dict_list = [{} for _ in range(26)]
# TODO: set vocabulary dictionary
vocab_dir = "./vocab/"
for i in range(26):
lookup_idx = 1 # remain 0 for default value
for line in open(
os.path.join(vocab_dir, 'C' + str(i + 1) + '.txt')):
self.cat_feat_idx_dict_list[i][line.strip()] = lookup_idx
lookup_idx += 1
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
label_feat_list = [[] for _ in range(40)]
for idx in self.cont_idx_:
if features[idx] == '':
label_feat_list[idx].append(0)
else:
# 0-1 minmax norm
# label_feat_list[idx].append((float(features[idx]) - self.cont_min_[idx - 1]) /
# self.cont_diff_[idx - 1])
# log transform
label_feat_list[idx].append(
math.log(4 + float(features[idx]))
if idx == 2 else math.log(1 + float(features[idx])))
for idx in self.cat_idx_:
if features[idx] == '' or features[
idx] not in self.cat_feat_idx_dict_list[idx - 14]:
label_feat_list[idx].append(0)
else:
label_feat_list[idx].append(self.cat_feat_idx_dict_list[
idx - 14][features[idx]])
label_feat_list[0].append(int(features[0]))
return label_feat_list
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
label_feat_list = self._process_line(line)
s = ""
for i in list(zip(self.label_feat_names, label_feat_list)):
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
python download.py
python preprocess.py
mkdir slot_train
for i in `ls ./train`
do
cat train/$i | python get_slot_data.py > slot_train/$i
done
mkdir slot_test_valid
for i in `ls ./test_valid`
do
cat test_valid/$i | python get_slot_data.py > slot_test_valid/$i
done
...@@ -22,8 +22,8 @@ train: ...@@ -22,8 +22,8 @@ train:
reader: reader:
batch_size: 2 batch_size: 2
train_data_path: "{workspace}/slot_data/train_data" train_data_path: "{workspace}/data/slot_train_data"
feat_dict_name: "{workspace}/slot_data/feat_dict_10.pkl2" feat_dict_name: "{workspace}/data/feat_dict_10.pkl2"
sparse_slots: "label feat_idx" sparse_slots: "label feat_idx"
dense_slots: "feat_value:39" dense_slots: "feat_value:39"
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
self.cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.cont_max_ = [
5775, 257675, 65535, 969, 23159456, 431037, 56311, 6047, 29019, 46,
231, 4008, 7393
]
self.cont_diff_ = [
self.cont_max_[i] - self.cont_min_[i]
for i in range(len(self.cont_min_))
]
self.continuous_range_ = range(1, 14)
self.categorical_range_ = range(14, 40)
# load preprocessed feature dict
self.feat_dict_name = "aid_data/feat_dict_10.pkl2"
self.feat_dict_ = pickle.load(open(self.feat_dict_name, 'rb'))
def _process_line(self, line):
features = line.rstrip('\n').split('\t')
feat_idx = []
feat_value = []
for idx in self.continuous_range_:
if features[idx] == '':
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[idx])
feat_value.append(
(float(features[idx]) - self.cont_min_[idx - 1]) /
self.cont_diff_[idx - 1])
for idx in self.categorical_range_:
if features[idx] == '' or features[idx] not in self.feat_dict_:
feat_idx.append(0)
feat_value.append(0.0)
else:
feat_idx.append(self.feat_dict_[features[idx]])
feat_value.append(1.0)
label = [int(features[0])]
return feat_idx, feat_value, label
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
python download_preprocess.py
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
...@@ -22,7 +22,7 @@ train: ...@@ -22,7 +22,7 @@ train:
reader: reader:
batch_size: 2 batch_size: 2
train_data_path: "{workspace}/slot_data/train" train_data_path: "{workspace}/data/slot_train_data"
reader_debug_mode: False reader_debug_mode: False
sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26" sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
dense_slots: "dense_var:13" dense_slots: "dense_var:13"
......
wget --no-check-certificate https://fleet.bj.bcebos.com/ctr_data.tar.gz
tar -zxvf ctr_data.tar.gz
mv ./raw_data ./train_data_full
mkdir train_data && cd train_data
cp ../train_data_full/part-0 ../train_data_full/part-1 ./ && cd ..
mv ./test_data ./test_data_full
mkdir test_data && cd test_data
cp ../test_data_full/part-220 ./ && cd ..
echo "Complete data download."
echo "Full Train data stored in ./train_data_full "
echo "Full Test data stored in ./test_data_full "
echo "Rapid Verification train data stored in ./train_data "
echo "Rapid Verification test data stored in ./test_data "
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.incubate.data_generator as dg
cont_min_ = [0, -3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
cont_max_ = [20, 600, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
cont_diff_ = [20, 603, 100, 50, 64000, 500, 100, 50, 500, 10, 10, 10, 50]
hash_dim_ = 1000001
continuous_range_ = range(1, 14)
categorical_range_ = range(14, 40)
class CriteoDataset(dg.MultiSlotDataGenerator):
"""
DacDataset: inheritance MultiSlotDataGeneratior, Implement data reading
Help document: http://wiki.baidu.com/pages/viewpage.action?pageId=728820675
"""
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def reader():
"""
This function needs to be implemented by the user, based on data format
"""
features = line.rstrip('\n').split('\t')
dense_feature = []
sparse_feature = []
for idx in continuous_range_:
if features[idx] == "":
dense_feature.append(0.0)
else:
dense_feature.append(
(float(features[idx]) - cont_min_[idx - 1]) /
cont_diff_[idx - 1])
for idx in categorical_range_:
sparse_feature.append(
[hash(str(idx) + features[idx]) % hash_dim_])
label = [int(features[0])]
process_line = dense_feature, sparse_feature, label
feature_name = ["dense_feature"]
for idx in categorical_range_:
feature_name.append("C" + str(idx - 13))
feature_name.append("label")
s = "click:" + str(label[0])
for i in dense_feature:
s += " dense_feature:" + str(i)
for i in range(1, 1 + len(categorical_range_)):
s += " " + str(i) + ":" + str(sparse_feature[i-1][0])
print s.strip()
yield None
return reader
d = CriteoDataset()
d.run_from_stdin()
sh download.sh
mkdir slot_train_data_full
for i in `ls ./train_data_full`
do
cat train_data_full/$i | python get_slot_data.py > slot_train_data_full/$i
done
mkdir slot_test_data_full
for i in `ls ./test_data_full`
do
cat test_data_full/$i | python get_slot_data.py > slot_test_data_full/$i
done
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
此差异已折叠。
此差异已折叠。
...@@ -22,7 +22,7 @@ train: ...@@ -22,7 +22,7 @@ train:
reader: reader:
batch_size: 2 batch_size: 2
train_data_path: "{workspace}/slot_data/train_data" train_data_path: "{workspace}/data/slot_train_data"
sparse_slots: "label" sparse_slots: "label"
dense_slots: "wide_input:8 deep_input:58" dense_slots: "wide_input:8 deep_input:58"
......
mkdir train_data mkdir train_data
mkdir test_data mkdir test_data
mkdir data train_path="adult.data"
train_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.data" test_path="adult.test"
test_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/data/adult.test" train_data_path="./train_data/train_data.csv"
train_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/train_data/train_data.csv" test_data_path="./test_data/test_data.csv"
test_data_path="/home/yaoxuefeng/repos/models/models/PaddleRec/ctr/wide_deep/test_data/test_data.csv"
#pip install -r requirements.txt pip install -r requirements.txt
#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
#wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test wget -P data/ https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
python data_preparation.py --train_path ${train_path} \ python data_preparation.py --train_path ${train_path} \
--test_path ${test_path} \ --test_path ${test_path} \
......
import os
import io
import args
import pandas as pd
from sklearn import preprocessing
def _clean_file(source_path,target_path):
"""makes changes to match the CSV format."""
with io.open(source_path, 'r') as temp_eval_file:
with io.open(target_path, 'w') as eval_file:
for line in temp_eval_file:
line = line.strip()
line = line.replace(', ', ',')
if not line or ',' not in line:
continue
if line[-1] == '.':
line = line[:-1]
line += '\n'
eval_file.write(line)
def build_model_columns(train_data_path, test_data_path):
# The column names are from
# https://www2.1010data.com/documentationcenter/prod/Tutorials/MachineLearningExamples/CensusIncomeDataSet.html
column_names = [
'age', 'workclass', 'fnlwgt', 'education', 'education_num',
'marital_status', 'occupation', 'relationship', 'race', 'gender',
'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
'income_bracket'
]
# Load the dataset in Pandas
train_df = pd.read_csv(
train_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
test_df = pd.read_csv(
test_data_path,
delimiter=',',
header=None,
index_col=None,
names=column_names)
# First group of tasks according to the paper
#label_columns = ['income_50k', 'marital_stat']
categorical_columns = ['education','marital_status','relationship','workclass','occupation']
for col in categorical_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
bins = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]
train_df['age_buckets'] = pd.cut(train_df['age'].values.tolist(), bins,labels=False)
test_df['age_buckets'] = pd.cut(test_df['age'].values.tolist(), bins,labels=False)
base_columns = ['education', 'marital_status', 'relationship', 'workclass', 'occupation', 'age_buckets']
train_df['education_occupation'] = train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['education_occupation'] = test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
train_df['age_buckets_education_occupation'] = train_df['age_buckets'].astype(str) + '_' + train_df['education'].astype(str) + '_' + train_df['occupation'].astype(str)
test_df['age_buckets_education_occupation'] = test_df['age_buckets'].astype(str) + '_' + test_df['education'].astype(str) + '_' + test_df['occupation'].astype(str)
crossed_columns = ['education_occupation','age_buckets_education_occupation']
for col in crossed_columns:
label_train = preprocessing.LabelEncoder()
train_df[col]= label_train.fit_transform(train_df[col])
label_test = preprocessing.LabelEncoder()
test_df[col]= label_test.fit_transform(test_df[col])
wide_columns = base_columns + crossed_columns
train_df_temp = pd.get_dummies(train_df[categorical_columns],columns=categorical_columns)
test_df_temp = pd.get_dummies(test_df[categorical_columns], columns=categorical_columns)
train_df = train_df.join(train_df_temp)
test_df = test_df.join(test_df_temp)
deep_columns = list(train_df_temp.columns)+ ['age','education_num','capital_gain','capital_loss','hours_per_week']
train_df['label'] = train_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
test_df['label'] = test_df['income_bracket'].apply(lambda x : 1 if x == '>50K' else 0)
with io.open('train_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
f.write(write_str)
f.close()
with io.open('test_data/columns.txt','w') as f:
write_str = str(len(wide_columns)) + '\n' + str(len(deep_columns)) + '\n'
f.write(write_str)
f.close()
train_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(train_data_path,index=False)
test_df[wide_columns + deep_columns + ['label']].fillna(0).to_csv(test_data_path,index=False)
def clean_file(train_path, test_path, train_data_path, test_data_path):
_clean_file(train_path, train_data_path)
_clean_file(test_path, test_data_path)
if __name__ == '__main__':
args = args.parse_args()
clean_file(args.train_path, args.test_path, args.train_data_path, args.test_data_path)
build_model_columns(args.train_data_path, args.test_data_path)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
pass
def _process_line(self, line):
line = line.strip().split(',')
features = list(map(float, line))
wide_feat = features[0:8]
deep_feat = features[8:58+8]
label = int(features[-1])
return wide_feat, deep_feat, [label]
def generate_sample(self, line):
"""
Read the data line by line and process it as a dictionary
"""
def data_iter():
wide_feat, deep_deat, label = self._process_line(line)
s = ""
for i in [('wide_input', wide_feat), ('deep_input', deep_deat), ('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
sh create_data.sh
mkdir slot_train_data
for i in `ls ./train_data`
do
cat train_data/$i | python get_slot_data.py > slot_train_data/$i
done
mkdir slot_test_data
for i in `ls ./test_data`
do
cat test_data/$i | python get_slot_data.py > slot_test_data/$i
done
...@@ -22,7 +22,7 @@ train: ...@@ -22,7 +22,7 @@ train:
reader: reader:
batch_size: 2 batch_size: 2
train_data_path: "{workspace}/slot_data/train_data" train_data_path: "{workspace}/data/slot_train_data"
sparse_slots: "label feat_idx" sparse_slots: "label feat_idx"
dense_slots: "feat_value:39" dense_slots: "feat_value:39"
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import yaml
from paddlerec.core.reader import Reader
from paddlerec.core.utils import envs
try:
import cPickle as pickle
except ImportError:
import pickle
import paddle.fluid.incubate.data_generator as dg
class TrainReader(dg.MultiSlotDataGenerator):
def __init__(self, config):
dg.MultiSlotDataGenerator.__init__(self)
if os.path.isfile(config):
with open(config, 'r') as rb:
_config = yaml.load(rb.read(), Loader=yaml.FullLoader)
else:
raise ValueError("reader config only support yaml")
def init(self):
pass
def _process_line(self, line):
features = line.strip('\n').split('\t')
feat_idx = []
feat_value = []
for idx in range(1, 40):
feat_idx.append(int(features[idx]))
feat_value.append(1.0)
label = [int(features[0])]
return feat_idx, feat_value, label
def generate_sample(self, line):
def data_iter():
feat_idx, feat_value, label = self._process_line(line)
s = ""
for i in [('feat_idx', feat_idx), ('feat_value', feat_value), ('label', label)]:
k = i[0]
v = i[1]
for j in v:
s += " " + k + ":" + str(j)
print s.strip()
yield None
return data_iter
reader = TrainReader("../config.yaml")
reader.init()
reader.run_from_stdin()
python download.py
mkdir -p slot_train_data/tr
for i in `ls ./train_data/tr`
do
cat train_data/tr/$i | python get_slot_data.py > slot_train_data/tr/$i
done
mkdir slot_test_data/ev
for i in `ls ./test_data/ev`
do
cat test_data/ev/$i | python get_slot_data.py > slot_test_data/ev/$i
done
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册