Unverified · Commit 5f187850 authored by: Z zhang wenhui, committed by: GitHub

Update2.0 model (#4905)

* update api 1.8

* fix paddlerec readme

* update 20 , test=develop
Parent 3fad507e
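The first file added below is a plain index list. Judging from `split_data` in `preprocess.py` further down, it is most plausibly a pre-generated `aid_data/train_file_idx.txt`: the indices of the `part-*` shards assigned to the training split.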
[156, 51, 24, 103, 195, 35, 188, 16, 224, 173, 116, 3, 226, 11, 64, 94, 6, 70, 197, 164, 220, 77, 172, 194, 227, 12, 65, 129, 39, 38, 75, 210, 215, 36, 46, 185, 76, 222, 108, 78, 120, 71, 33, 189, 135, 97, 90, 219, 105, 205, 136, 167, 106, 29, 157, 125, 217, 121, 175, 143, 200, 45, 179, 37, 86, 140, 225, 47, 20, 228, 4, 209, 177, 178, 171, 58, 48, 118, 9, 149, 55, 192, 82, 17, 43, 54, 93, 96, 159, 216, 18, 206, 223, 104, 132, 182, 60, 109, 28, 180, 44, 166, 128, 27, 163, 141, 229, 102, 150, 7, 83, 198, 41, 191, 114, 117, 122, 161, 130, 174, 176, 160, 201, 49, 112, 69, 165, 95, 133, 92, 59, 110, 151, 203, 67, 169, 21, 66, 80, 22, 23, 152, 40, 127, 111, 186, 72, 26, 190, 42, 0, 63, 53, 124, 137, 85, 126, 196, 187, 208, 98, 25, 15, 170, 193, 168, 202, 31, 146, 147, 113, 32, 204, 131, 68, 84, 213, 19, 81, 79, 162, 199, 107, 50, 2, 207, 10, 181, 144, 139, 134, 62, 155, 142, 214, 212, 61, 52, 101, 99, 158, 145, 13, 153, 56, 184, 221]
\ No newline at end of file
import os
import shutil
import sys

LOCAL_PATH = os.path.dirname(os.path.abspath(__file__))
TOOLS_PATH = os.path.join(LOCAL_PATH, "..", "..", "tools")
sys.path.append(TOOLS_PATH)

from tools import download_file_and_uncompress, download_file

if __name__ == '__main__':
    url = "https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz"
    url2 = "https://paddlerec.bj.bcebos.com/deepfm%2Ffeat_dict_10.pkl2"

    print("download and extract starting...")
    download_file_and_uncompress(url)
    if not os.path.exists("aid_data"):
        os.makedirs("aid_data")
    download_file(url2, "./aid_data/feat_dict_10.pkl2", True)
    print("download and extract finished")

    print("preprocessing...")
    os.system("python preprocess.py")
    print("preprocess done")

    shutil.rmtree("raw_data")
    print("done")
from __future__ import division
import os
import numpy
from collections import Counter
import shutil
import pickle


def get_raw_data(input_file, raw_data, ins_per_file):
    if not os.path.isdir(raw_data):
        os.mkdir(raw_data)

    fin = open(input_file, 'r')
    fout = open(os.path.join(raw_data, 'part-0'), 'w')
    for line_idx, line in enumerate(fin):
        if line_idx % ins_per_file == 0 and line_idx != 0:
            fout.close()
            cur_part_idx = int(line_idx / ins_per_file)
            fout = open(
                os.path.join(raw_data, 'part-' + str(cur_part_idx)), 'w')
        fout.write(line)
    fout.close()
    fin.close()


def split_data(raw_data, aid_data, train_data, test_data):
    split_rate_ = 0.9
    dir_train_file_idx_ = os.path.join(aid_data, 'train_file_idx.txt')
    filelist_ = [
        os.path.join(raw_data, 'part-%d' % x)
        for x in range(len(os.listdir(raw_data)))
    ]

    if not os.path.exists(dir_train_file_idx_):
        train_file_idx = list(
            numpy.random.choice(
                len(filelist_), int(len(filelist_) * split_rate_), False))
        with open(dir_train_file_idx_, 'w') as fout:
            fout.write(str(train_file_idx))
    else:
        with open(dir_train_file_idx_, 'r') as fin:
            train_file_idx = eval(fin.read())

    for idx in range(len(filelist_)):
        if idx in train_file_idx:
            shutil.move(filelist_[idx], train_data)
        else:
            shutil.move(filelist_[idx], test_data)


def get_feat_dict(input_file, aid_data, print_freq=100000, total_ins=45000000):
    freq_ = 10
    dir_feat_dict_ = os.path.join(aid_data, 'feat_dict_' + str(freq_) + '.pkl2')
    continuous_range_ = range(1, 14)
    categorical_range_ = range(14, 40)

    if not os.path.exists(dir_feat_dict_):
        # print('generate a feature dict')
        # Count the number of occurrences of discrete features
        feat_cnt = Counter()
        with open(input_file, 'r') as fin:
            for line_idx, line in enumerate(fin):
                if line_idx % print_freq == 0:
                    print(r'generating feature dict {:.2f} %'.format((
                        line_idx / total_ins) * 100))
                features = line.rstrip('\n').split('\t')
                for idx in categorical_range_:
                    if features[idx] == '': continue
                    feat_cnt.update([features[idx]])

        # Only retain discrete features with high frequency
        dis_feat_set = set()
        for feat, ot in feat_cnt.items():
            if ot >= freq_:
                dis_feat_set.add(feat)

        # Create a dictionary for continuous and discrete features
        feat_dict = {}
        tc = 1
        # Continuous features
        for idx in continuous_range_:
            feat_dict[idx] = tc
            tc += 1
        for feat in dis_feat_set:
            feat_dict[feat] = tc
            tc += 1

        # Save dictionary
        with open(dir_feat_dict_, 'wb') as fout:
            pickle.dump(feat_dict, fout, protocol=2)

        print('args.num_feat ', len(feat_dict) + 1)


def preprocess(input_file,
               outdir,
               ins_per_file,
               total_ins=None,
               print_freq=None):
    train_data = os.path.join(outdir, "train_data")
    test_data = os.path.join(outdir, "test_data")
    aid_data = os.path.join(outdir, "aid_data")
    raw_data = os.path.join(outdir, "raw_data")
    if not os.path.isdir(train_data):
        os.mkdir(train_data)
    if not os.path.isdir(test_data):
        os.mkdir(test_data)
    if not os.path.isdir(aid_data):
        os.mkdir(aid_data)
    if print_freq is None:
        print_freq = 10 * ins_per_file
    get_raw_data(input_file, raw_data, ins_per_file)
    split_data(raw_data, aid_data, train_data, test_data)
    get_feat_dict(input_file, aid_data, print_freq, total_ins)
    print('Done!')


if __name__ == '__main__':
    preprocess('train.txt', './', 200000, 45000000)
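For orientation, a hedged sketch of how the `feat_dict_10.pkl2` built above is typically consumed when featurizing a record. This reader is not part of the commit; index 0 as the rare/missing bucket is an assumption, while the dictionary layout (column index as key for continuous features, raw value as key for categorical ones) follows `get_feat_dict`:

import pickle

with open('aid_data/feat_dict_10.pkl2', 'rb') as f:
    feat_dict = pickle.load(f)

def featurize(line):
    # Map one raw Criteo line to (feature ids, feature values, label).
    # Illustrative only; feat_dict.get(..., 0) fallback is an assumption.
    features = line.rstrip('\n').split('\t')
    feat_idx, feat_value = [], []
    for idx in range(1, 14):       # continuous columns: key is the column index
        feat_idx.append(feat_dict.get(idx, 0))
        feat_value.append(0.0 if features[idx] == '' else float(features[idx]))
    for idx in range(14, 40):      # categorical columns: key is the raw value
        feat_idx.append(feat_dict.get(features[idx], 0))
        feat_value.append(1.0)
    return feat_idx, feat_value, int(features[0])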
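The remaining hunks diff the dygraph PTB language-model script (`SimpleGRURNN` / `PtbModel`), migrating it from fluid 1.x calls to Paddle 2.0 APIs; lines prefixed `-` are removed, lines prefixed `+` are added.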
@@ -16,20 +16,13 @@ from __future__ import print_function
 import os
 import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.dygraph.nn import Embedding
-import paddle.fluid.framework as framework
-from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.optimizer import AdagradOptimizer
-from paddle.fluid.dygraph.base import to_variable
 import paddle
 import numpy as np
 import six
 import reader
 import model_check
 import time
 from args import *
 import sys
@@ -38,7 +31,7 @@ if sys.version[0] == '2':
     sys.setdefaultencoding("utf-8")
 
-class SimpleGRURNN(fluid.Layer):
+class SimpleGRURNN(paddle.fluid.Layer):
     def __init__(self,
                  hidden_size,
                  num_steps,
@@ -61,47 +54,42 @@ class SimpleGRURNN(fluid.Layer):
         for i in range(self._num_layers):
             weight_1 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
+                attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
+                    low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size * 2, self._hidden_size * 2],
                 dtype="float32",
-                default_initializer=fluid.initializer.UniformInitializer(
+                default_initializer=paddle.nn.initializer.Uniform(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(self.add_parameter('w1_%d' % i, weight_1))
             weight_2 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
+                attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
+                    low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size, self._hidden_size],
                 dtype="float32",
-                default_initializer=fluid.initializer.UniformInitializer(
+                default_initializer=paddle.nn.initializer.Uniform(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_2_arr.append(self.add_parameter('w2_%d' % i, weight_2))
             weight_3 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
+                attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
+                    low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size, self._hidden_size],
                 dtype="float32",
-                default_initializer=fluid.initializer.UniformInitializer(
+                default_initializer=paddle.nn.initializer.Uniform(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_3_arr.append(self.add_parameter('w3_%d' % i, weight_3))
             bias_1 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
+                attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
+                    low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size * 2],
                 dtype="float32",
-                default_initializer=fluid.initializer.Constant(0.0))
+                default_initializer=paddle.nn.initializer.Constant(0.0))
             self.bias_1_arr.append(self.add_parameter('b1_%d' % i, bias_1))
             bias_2 = self.create_parameter(
-                attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.UniformInitializer(
-                        low=-self._init_scale, high=self._init_scale)),
+                attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Uniform(
+                    low=-self._init_scale, high=self._init_scale)),
                 shape=[self._hidden_size * 1],
                 dtype="float32",
-                default_initializer=fluid.initializer.Constant(0.0))
+                default_initializer=paddle.nn.initializer.Constant(0.0))
             self.bias_2_arr.append(self.add_parameter('b2_%d' % i, bias_2))
@@ -121,39 +109,38 @@ class SimpleGRURNN(fluid.Layer):
             bias_1 = self.bias_1_arr[k]
             bias_2 = self.bias_2_arr[k]
-            nn = fluid.layers.concat([step_input, pre_hidden], 1)
-            gate_input = fluid.layers.matmul(x=nn, y=weight_1)
-            gate_input = fluid.layers.elementwise_add(gate_input, bias_1)
-            u, r = fluid.layers.split(gate_input, num_or_sections=2, dim=-1)
-            hidden_c = fluid.layers.tanh(
-                fluid.layers.elementwise_add(
-                    fluid.layers.matmul(
-                        x=step_input, y=weight_2) + fluid.layers.matmul(
-                            x=(fluid.layers.sigmoid(r) * pre_hidden),
-                            y=weight_3),
-                    bias_2))
-            hidden_state = fluid.layers.sigmoid(u) * pre_hidden + (
-                1.0 - fluid.layers.sigmoid(u)) * hidden_c
+            nn = paddle.concat(x=[step_input, pre_hidden], axis=1)
+            gate_input = paddle.matmul(x=nn, y=weight_1)
+            gate_input = paddle.add(x=gate_input, y=bias_1)
+            u, r = paddle.split(x=gate_input, num_or_sections=2, axis=-1)
+            hidden_c = paddle.tanh(
+                paddle.add(x=paddle.matmul(
+                    x=step_input, y=weight_2) + paddle.matmul(
+                        x=(paddle.nn.functional.sigmoid(r) * pre_hidden),
+                        y=weight_3),
+                    y=bias_2))
+            hidden_state = paddle.nn.functional.sigmoid(u) * pre_hidden + (
+                1.0 - paddle.nn.functional.sigmoid(u)) * hidden_c
             hidden_array[k] = hidden_state
             step_input = hidden_state
             if self._dropout is not None and self._dropout > 0.0:
-                step_input = fluid.layers.dropout(
+                step_input = paddle.fluid.layers.dropout(
                     step_input,
                     dropout_prob=self._dropout,
                     dropout_implementation='upscale_in_train')
             res.append(step_input)
-        real_res = fluid.layers.concat(res, 1)
-        real_res = fluid.layers.reshape(
+        real_res = paddle.concat(x=res, axis=1)
+        real_res = paddle.fluid.layers.reshape(
             real_res, [-1, self._num_steps, self._hidden_size])
-        last_hidden = fluid.layers.concat(hidden_array, 1)
-        last_hidden = fluid.layers.reshape(
+        last_hidden = paddle.concat(x=hidden_array, axis=1)
+        last_hidden = paddle.fluid.layers.reshape(
             last_hidden, shape=[-1, self._num_layers, self._hidden_size])
-        last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])
+        last_hidden = paddle.transpose(x=last_hidden, perm=[1, 0, 2])
         return real_res, last_hidden
 
-class PtbModel(fluid.Layer):
+class PtbModel(paddle.fluid.Layer):
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -177,26 +164,26 @@ class PtbModel(fluid.Layer):
             num_layers=num_layers,
             init_scale=init_scale,
             dropout=dropout)
-        self.embedding = Embedding(
+        self.embedding = paddle.fluid.dygraph.nn.Embedding(
             #self.full_name(),
             size=[vocab_size, hidden_size],
             dtype='float32',
             is_sparse=False,
-            param_attr=fluid.ParamAttr(
+            param_attr=paddle.ParamAttr(
                 name='embedding_para',
-                initializer=fluid.initializer.UniformInitializer(
+                initializer=paddle.nn.initializer.Uniform(
                     low=-init_scale, high=init_scale)))
         self.softmax_weight = self.create_parameter(
-            attr=fluid.ParamAttr(),
+            attr=paddle.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
-            default_initializer=fluid.initializer.UniformInitializer(
+            default_initializer=paddle.nn.initializer.Uniform(
                 low=-self.init_scale, high=self.init_scale))
         self.softmax_bias = self.create_parameter(
-            attr=fluid.ParamAttr(),
+            attr=paddle.ParamAttr(),
             shape=[self.vocab_size],
             dtype="float32",
-            default_initializer=fluid.initializer.UniformInitializer(
+            default_initializer=paddle.nn.initializer.Uniform(
                 low=-self.init_scale, high=self.init_scale))
 
     def build_once(self, input, label, init_hidden):
@@ -204,30 +191,31 @@ class PtbModel(fluid.Layer):
     def forward(self, input, label, init_hidden):
-        init_h = fluid.layers.reshape(
+        init_h = paddle.fluid.layers.reshape(
             init_hidden, shape=[self.num_layers, -1, self.hidden_size])
         x_emb = self.embedding(input)
-        x_emb = fluid.layers.reshape(
+        x_emb = paddle.fluid.layers.reshape(
             x_emb, shape=[-1, self.num_steps, self.hidden_size])
         if self.dropout is not None and self.dropout > 0.0:
-            x_emb = fluid.layers.dropout(
+            x_emb = paddle.fluid.layers.dropout(
                 x_emb,
                 dropout_prob=self.dropout,
                 dropout_implementation='upscale_in_train')
         rnn_out, last_hidden = self.simple_gru_rnn(x_emb, init_h)
-        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
-        projection = fluid.layers.elementwise_add(projection, self.softmax_bias)
-        loss = fluid.layers.softmax_with_cross_entropy(
+        projection = paddle.matmul(x=rnn_out, y=self.softmax_weight)
+        projection = paddle.add(x=projection, y=self.softmax_bias)
+        loss = paddle.nn.functional.softmax_with_cross_entropy(
             logits=projection, label=label, soft_label=False)
-        pre_2d = fluid.layers.reshape(projection, shape=[-1, self.vocab_size])
-        label_2d = fluid.layers.reshape(label, shape=[-1, 1])
-        acc = fluid.layers.accuracy(input=pre_2d, label=label_2d, k=20)
-        loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
-        loss = fluid.layers.reduce_mean(loss, dim=[0])
-        loss = fluid.layers.reduce_sum(loss)
+        pre_2d = paddle.fluid.layers.reshape(
+            projection, shape=[-1, self.vocab_size])
+        label_2d = paddle.fluid.layers.reshape(label, shape=[-1, 1])
+        acc = paddle.metric.accuracy(input=pre_2d, label=label_2d, k=20)
+        loss = paddle.fluid.layers.reshape(loss, shape=[-1, self.num_steps])
+        loss = paddle.reduce_mean(loss, dim=[0])
+        loss = paddle.reduce_sum(loss)
         return loss, last_hidden, acc
@@ -263,13 +251,13 @@ def train_ptb_lm():
         print("model type not support")
         return
 
-    with fluid.dygraph.guard(core.CUDAPlace(0)):
+    paddle.disable_static(paddle.fluid.core.CUDAPlace(0))
     if args.ce:
         print("ce mode")
         seed = 33
         np.random.seed(seed)
-        fluid.default_startup_program().random_seed = seed
-        fluid.default_main_program().random_seed = seed
+        paddle.static.default_startup_program().random_seed = seed
+        paddle.static.default_main_program().random_seed = seed
         max_epoch = 1
     ptb_model = PtbModel(
         "ptb_model",
@@ -285,7 +273,7 @@ def train_ptb_lm():
             print(args.init_from_pretrain_model)
             raise Warning("The pretrained params do not exist.")
             return
-        fluid.load_dygraph(args.init_from_pretrain_model)
+        paddle.fluid.load_dygraph(args.init_from_pretrain_model)
         print("finish initing model from pretrained params from %s" %
               (args.init_from_pretrain_model))
@@ -309,15 +297,16 @@ def train_ptb_lm():
     lr_arr = [base_learning_rate]
     for i in range(1, max_epoch):
         bd.append(total_batch_size * i)
-        new_lr = base_learning_rate * (lr_decay**
-                                       max(i + 1 - epoch_start_decay, 0.0))
+        new_lr = base_learning_rate * (lr_decay
+                                       **max(i + 1 - epoch_start_decay, 0.0))
         lr_arr.append(new_lr)
 
-    grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
-    sgd = AdagradOptimizer(
-        parameter_list=ptb_model.parameters(),
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=bd, values=lr_arr),
+    grad_clip = paddle.nn.ClipGradByGlobalNorm(max_grad_norm)
+    sgd = paddle.optimizer.Adagrad(
+        parameters=ptb_model.parameters(),
+        learning_rate=base_learning_rate,
+        #learning_rate=paddle.fluid.layers.piecewise_decay(
+        #    boundaries=bd, values=lr_arr),
         grad_clip=grad_clip)
 
     print("parameters:--------------------------------")
@@ -334,14 +323,17 @@ def train_ptb_lm():
         model.eval()
         train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
-        init_hidden = to_variable(init_hidden_data)
+        init_hidden = paddle.to_tensor(
+            data=init_hidden_data, dtype=None, place=None, stop_gradient=True)
         accum_num_recall = 0.0
         for batch_id, batch in enumerate(train_data_iter):
             x_data, y_data = batch
             x_data = x_data.reshape((-1, num_steps, 1))
             y_data = y_data.reshape((-1, num_steps, 1))
-            x = to_variable(x_data)
-            y = to_variable(y_data)
+            x = paddle.to_tensor(
+                data=x_data, dtype=None, place=None, stop_gradient=True)
+            y = paddle.to_tensor(
+                data=y_data, dtype=None, place=None, stop_gradient=True)
             dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
             out_loss = dy_loss.numpy()
@@ -371,15 +363,18 @@ def train_ptb_lm():
         train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                num_steps)
 
-        init_hidden = to_variable(init_hidden_data)
+        init_hidden = paddle.to_tensor(
+            data=init_hidden_data, dtype=None, place=None, stop_gradient=True)
         start_time = time.time()
         for batch_id, batch in enumerate(train_data_iter):
            x_data, y_data = batch
             x_data = x_data.reshape((-1, num_steps, 1))
             y_data = y_data.reshape((-1, num_steps, 1))
-            x = to_variable(x_data)
-            y = to_variable(y_data)
+            x = paddle.to_tensor(
+                data=x_data, dtype=None, place=None, stop_gradient=True)
+            y = paddle.to_tensor(
+                data=y_data, dtype=None, place=None, stop_gradient=True)
             dy_loss, last_hidden, acc = ptb_model(x, y, init_hidden)
             out_loss = dy_loss.numpy()
@@ -407,9 +402,10 @@ def train_ptb_lm():
             print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
             save_model_dir = os.path.join(args.save_model_dir,
                                           str(epoch_id), 'params')
-            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
+            paddle.fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
             print("Saved model to: %s.\n" % save_model_dir)
 
-    eval(ptb_model, test_data)
+    paddle.enable_static()
+    #eval(ptb_model, test_data)
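As a reading aid for the `SimpleGRURNN.forward` hunk above, the per-step update it implements can be restated in plain numpy. This is an illustrative re-statement, not code from the commit:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h, w1, w2, w3, b1, b2):
    # x, h: [batch, hidden]; w1: [2*hidden, 2*hidden];
    # w2, w3: [hidden, hidden]; b1: [2*hidden]; b2: [hidden]
    gate_input = np.concatenate([x, h], axis=1) @ w1 + b1  # fused gate matmul
    u, r = np.split(gate_input, 2, axis=-1)                # update / reset gates
    h_c = np.tanh(x @ w2 + (sigmoid(r) * h) @ w3 + b2)     # candidate state
    return sigmoid(u) * h + (1.0 - sigmoid(u)) * h_c       # interpolated new state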
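Most of the changes are a mechanical fluid 1.x to Paddle 2.0 substitution. Condensed from the hunks above (1.x call on the left, 2.0 call on the right):

# fluid.layers.concat(xs, 1)             -> paddle.concat(x=xs, axis=1)
# fluid.layers.matmul(a, b)              -> paddle.matmul(x=a, y=b)
# fluid.layers.elementwise_add(a, b)     -> paddle.add(x=a, y=b)
# fluid.layers.split(x, 2, dim=-1)       -> paddle.split(x=x, num_or_sections=2, axis=-1)
# fluid.layers.sigmoid(x)                -> paddle.nn.functional.sigmoid(x)
# fluid.layers.transpose(x, perm)        -> paddle.transpose(x=x, perm=perm)
# fluid.layers.accuracy(...)             -> paddle.metric.accuracy(...)
# fluid.initializer.UniformInitializer   -> paddle.nn.initializer.Uniform
# fluid.clip.GradientClipByGlobalNorm    -> paddle.nn.ClipGradByGlobalNorm
# AdagradOptimizer(parameter_list=...)   -> paddle.optimizer.Adagrad(parameters=...)
# to_variable(ndarray)                   -> paddle.to_tensor(ndarray)
# with fluid.dygraph.guard(place):       -> paddle.disable_static(place) ... paddle.enable_static()
# reshape / dropout / save_dygraph / load_dygraph stay on paddle.fluid.* in this commit.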