未验证 提交 5f187850 编写于 作者: Z zhang wenhui 提交者: GitHub

Update2.0 model (#4905)

* update api 1.8

* fix paddlerec readme

* update 20 , test=develop
上级 3fad507e
[156, 51, 24, 103, 195, 35, 188, 16, 224, 173, 116, 3, 226, 11, 64, 94, 6, 70, 197, 164, 220, 77, 172, 194, 227, 12, 65, 129, 39, 38, 75, 210, 215, 36, 46, 185, 76, 222, 108, 78, 120, 71, 33, 189, 135, 97, 90, 219, 105, 205, 136, 167, 106, 29, 157, 125, 217, 121, 175, 143, 200, 45, 179, 37, 86, 140, 225, 47, 20, 228, 4, 209, 177, 178, 171, 58, 48, 118, 9, 149, 55, 192, 82, 17, 43, 54, 93, 96, 159, 216, 18, 206, 223, 104, 132, 182, 60, 109, 28, 180, 44, 166, 128, 27, 163, 141, 229, 102, 150, 7, 83, 198, 41, 191, 114, 117, 122, 161, 130, 174, 176, 160, 201, 49, 112, 69, 165, 95, 133, 92, 59, 110, 151, 203, 67, 169, 21, 66, 80, 22, 23, 152, 40, 127, 111, 186, 72, 26, 190, 42, 0, 63, 53, 124, 137, 85, 126, 196, 187, 208, 98, 25, 15, 170, 193, 168, 202, 31, 146, 147, 113, 32, 204, 131, 68, 84, 213, 19, 81, 79, 162, 199, 107, 50, 2, 207, 10, 181, 144, 139, 134, 62, 155, 142, 214, 212, 61, 52, 101, 99, 158, 145, 13, 153, 56, 184, 221]
\ No newline at end of file
import os
import shutil
import sys

# Make the repo-level "tools" package (two directories up) importable so the
# shared download helpers below can be used from this script's location.
LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)

# NOTE(review): this import must come after the sys.path edit above.
from tools import download_file_and_uncompress, download_file
if __name__ == '__main__':
    # Criteo display-advertising dataset archive and a prebuilt feature dict.
    dac_url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
    feat_dict_url = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"

    print("download and extract starting...")
    download_file_and_uncompress(dac_url)
    if not os.path.exists("aid_data"):
        os.makedirs("aid_data")
    download_file(feat_dict_url, "./aid_data/feat_dict_10.pkl2", True)
    print("download and extract finished")

    # Shard and split the raw data, then drop the intermediate files.
    print("preprocessing...")
    os.system("python preprocess.py")
    print("preprocess done")
    shutil.rmtree("raw_data")
    print("done")
from __future__ import division

import ast
import os
import pickle
import shutil
from collections import Counter

import numpy
def get_raw_data(intput_file, raw_data, ins_per_file):
    """Shard a raw text dataset into fixed-size part files.

    Reads ``intput_file`` line by line and writes consecutive chunks of
    ``ins_per_file`` lines to ``raw_data/part-0``, ``raw_data/part-1``, ...
    An (empty) ``part-0`` is created even for an empty input, matching the
    original behavior that downstream code counts files in ``raw_data``.

    Args:
        intput_file: path of the source text file (one instance per line).
        raw_data: output directory, created if missing.
        ins_per_file: number of lines per part file.
    """
    if not os.path.isdir(raw_data):
        os.mkdir(raw_data)
    with open(intput_file, 'r') as fin:
        fout = open(os.path.join(raw_data, 'part-0'), 'w')
        try:
            for line_idx, line in enumerate(fin):
                if line_idx % ins_per_file == 0 and line_idx != 0:
                    # Current shard is full: rotate to the next part file.
                    fout.close()
                    # Floor division: the file carries
                    # `from __future__ import division`, so the original
                    # int(line_idx / ins_per_file) relied on float precision.
                    cur_part_idx = line_idx // ins_per_file
                    fout = open(
                        os.path.join(raw_data, 'part-' + str(cur_part_idx)),
                        'w')
                fout.write(line)
        finally:
            # Close the current shard even if a write fails (no handle leak).
            fout.close()
def split_data(raw_data, aid_data, train_data, test_data):
    """Move part files into train/test directories with a 90/10 split.

    The chosen train indices are memoized in ``aid_data/train_file_idx.txt``
    so repeated runs reproduce the same split.

    Args:
        raw_data: directory holding files named ``part-0`` .. ``part-N``.
        aid_data: directory for the memoized index file (must exist).
        train_data: destination directory for training shards.
        test_data: destination directory for test shards.
    """
    split_rate_ = 0.9
    dir_train_file_idx_ = os.path.join(aid_data, 'train_file_idx.txt')
    filelist_ = [
        os.path.join(raw_data, 'part-%d' % x)
        for x in range(len(os.listdir(raw_data)))
    ]

    if not os.path.exists(dir_train_file_idx_):
        # Cast numpy scalars to plain int so the memo file contains a
        # portable Python literal (numpy 2.x reprs like "np.int64(3)" would
        # not round-trip).
        train_file_idx = [
            int(i)
            for i in numpy.random.choice(
                len(filelist_), int(len(filelist_) * split_rate_), False)
        ]
        with open(dir_train_file_idx_, 'w') as fout:
            fout.write(str(train_file_idx))
    else:
        with open(dir_train_file_idx_, 'r') as fin:
            # literal_eval instead of eval: parse data, never execute it.
            train_file_idx = ast.literal_eval(fin.read())

    # Set membership: O(1) per file instead of scanning the list each time.
    train_set = set(train_file_idx)
    for idx, path in enumerate(filelist_):
        shutil.move(path, train_data if idx in train_set else test_data)
def get_feat_dict(intput_file,
                  aid_data,
                  print_freq=100000,
                  total_ins=45000000,
                  freq=10):
    """Build and pickle the feature-to-index dictionary for DeepFM.

    Continuous features (columns 1-13 of the tab-separated input) map to
    fixed indices 1..13; categorical values (columns 14-39) occurring at
    least ``freq`` times get the subsequent indices. Does nothing if the
    pickle already exists.

    Args:
        intput_file: tab-separated raw data (label + 13 continuous + 26
            categorical columns per line).
        aid_data: output directory for ``feat_dict_<freq>.pkl2``.
        print_freq: progress is printed every ``print_freq`` lines.
        total_ins: total line count, used only for the progress percentage.
        freq: minimum occurrence count for a categorical value to be kept
            (default 10, matching the original hard-coded threshold).
    """
    dir_feat_dict_ = os.path.join(aid_data, 'feat_dict_' + str(freq) + '.pkl2')
    continuous_range_ = range(1, 14)
    categorical_range_ = range(14, 40)
    if os.path.exists(dir_feat_dict_):
        # Dictionary already generated by a previous run.
        return

    # Count the number of occurrences of each discrete (categorical) value.
    feat_cnt = Counter()
    with open(intput_file, 'r') as fin:
        for line_idx, line in enumerate(fin):
            if line_idx % print_freq == 0:
                print('generating feature dict {:.2f} %'.format(
                    (line_idx / total_ins) * 100))
            features = line.rstrip('\n').split('\t')
            for idx in categorical_range_:
                if features[idx] == '':
                    continue
                feat_cnt.update([features[idx]])

    # Only retain discrete features with high frequency.
    dis_feat_set = {feat for feat, cnt in feat_cnt.items() if cnt >= freq}

    # Create a dictionary for continuous and discrete features.
    feat_dict = {}
    tc = 1
    for idx in continuous_range_:
        feat_dict[idx] = tc
        tc += 1
    # sorted() makes the index assignment deterministic across runs
    # (plain set iteration order varies with PYTHONHASHSEED).
    for feat in sorted(dis_feat_set):
        feat_dict[feat] = tc
        tc += 1

    # Save dictionary (protocol 2 for Python 2 compatibility).
    with open(dir_feat_dict_, 'wb') as fout:
        pickle.dump(feat_dict, fout, protocol=2)
    print('args.num_feat ', len(feat_dict) + 1)
def preprocess(input_file,
               outdir,
               ins_per_file,
               total_ins=None,
               print_freq=None):
    """Run the full Criteo preprocessing pipeline.

    Shards ``input_file`` into ``outdir/raw_data``, splits the shards into
    ``outdir/train_data`` and ``outdir/test_data``, and builds the feature
    dictionary in ``outdir/aid_data``.

    Args:
        input_file: raw tab-separated Criteo data file.
        outdir: base output directory.
        ins_per_file: number of lines per shard.
        total_ins: total line count for progress reporting; defaults to the
            full Criteo size (45,000,000).
        print_freq: progress print interval; defaults to 10 * ins_per_file.
    """
    train_data = os.path.join(outdir, "train_data")
    test_data = os.path.join(outdir, "test_data")
    aid_data = os.path.join(outdir, "aid_data")
    raw_data = os.path.join(outdir, "raw_data")

    # makedirs(exist_ok=True) also creates missing parents of outdir,
    # where the old os.mkdir would raise. raw_data is made by get_raw_data.
    for directory in (train_data, test_data, aid_data):
        os.makedirs(directory, exist_ok=True)

    if print_freq is None:
        print_freq = 10 * ins_per_file
    if total_ins is None:
        # Bug fix: None used to be forwarded to get_feat_dict, where the
        # progress computation `line_idx / total_ins` would raise TypeError.
        total_ins = 45000000

    get_raw_data(input_file, raw_data, ins_per_file)
    split_data(raw_data, aid_data, train_data, test_data)
    get_feat_dict(input_file, aid_data, print_freq, total_ins)
    print('Done!')
if __name__ == '__main__':
    # Expects the raw Criteo "train.txt" in the current working directory;
    # writes train_data/, test_data/, aid_data/, raw_data/ under "./".
    # 200000 lines per shard; 45,000,000 total instances for progress output.
    preprocess('train.txt', './', 200000, 45000000)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册