Test set accuracy is too low
Created by: Annnnnnnnnnnnn
1. Sample format: variable-length input (the words a given user has searched), e.g.:
[1, 3, 4, 5 ....] [label]
[1, 2, 3, .....] [label]
2. Processing: the variable-length input is represented as a LoDTensor; the embeddings at the corresponding positions are sum-pooled into a fixed-length vector (see the model sketch after this list).
3. Paddle version: 1.3
4. Network structure: same as in deepctr, three fully connected layers.
5. Problem: training accuracy rises steadily, but test accuracy stays low the whole time (note: the whole time; it never goes through a drop, so it does not look like overfitting).
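The ctr_dnn_model referenced in the training code below is not posted. For context, a minimal sketch of what it might look like given points 1, 2 and 4 above; the layer sizes, the py_reader configuration and the binary softmax output are assumptions, not the poster's actual implementation:

# python
import paddle.fluid as fluid

def ctr_dnn_model(embed_size, sparse_feature_dim, is_training=True):
    """Sketch: sum-pooled sparse embedding followed by three FC layers."""
    if is_training:
        # async feeding path used by train(): a py_reader with a LoD-level-1 slot for the word ids
        py_reader = fluid.layers.py_reader(
            capacity=64,
            shapes=[[-1, 1], [-1, 1]],
            dtypes=["int64", "int64"],
            lod_levels=[1, 0],
            use_double_buffer=True)
        words, label = fluid.layers.read_file(py_reader)
    else:
        # plain data layers so eval() can feed batches through a DataFeeder
        py_reader = None
        words = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    # embedding lookup on the variable-length ids, sum-pooled into one fixed-length vector per sample
    emb = fluid.layers.embedding(input=words,
                                 size=[sparse_feature_dim, embed_size],
                                 is_sparse=True)
    pooled = fluid.layers.sequence_pool(input=emb, pool_type="sum")
    # three fully connected layers; the sizes are guesses
    fc1 = fluid.layers.fc(input=pooled, size=400, act="relu")
    fc2 = fluid.layers.fc(input=fc1, size=400, act="relu")
    fc3 = fluid.layers.fc(input=fc2, size=400, act="relu")
    predict = fluid.layers.fc(input=fc3, size=2, act="softmax")
    loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))
    acc = fluid.layers.accuracy(input=predict, label=label)
    return py_reader, [words, label], predict, loss, acc

With the LoD-level-1 words slot, each sample is fed as a plain Python list of word ids plus a one-element label list, which matches the variable-length format in point 1.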
# Imports implied by the code below; Config, Reader and ctr_dnn_model come from the poster's own modules.
import os
import glob
import time
import logging
from multiprocessing import cpu_count

import numpy as np
import paddle
import paddle.fluid as fluid

logger = logging.getLogger(__name__)


class ctr_model(object):
    def __init__(self, args):
        """
        Build the whole model.
        :param args: configuration
        """
        self.args = args
        place = fluid.CPUPlace()
        # place = fluid.CUDAPlace(0)
        self.exe = fluid.Executor(place)
        self.main_program = fluid.default_main_program()
        self._create_model()
        # self._create_opts()

    def _create_model(self):
        """
        Build the network.
        """
        logger.info("create networks")
        with fluid.program_guard(self.main_program):
            with fluid.unique_name.guard():
                self.py_reader, self.words, self.predict, self.loss, self.acc = \
                    ctr_dnn_model(self.args.embed_size, self.args.sparse_feature_dim,
                                  is_training=self.args.is_training)
                logger.info("create optimizer")
                self.optimizer = fluid.optimizer.Adam(learning_rate=self.args.lr)
                self.optimizer.minimize(self.loss)

    def _create_opts(self):
        """
        Build the optimizer.
        """
        logger.info("create optimizer")
        self.optimizer = fluid.optimizer.Adam(learning_rate=self.args.lr)
        self.optimizer.minimize(self.loss)

    def train(self):
        """
        Training loop.
        """
        logger.info("run local training")
        # print [p.name for p in self.main_program.global_block().all_parameters()]
        # Must be run before loading a model; otherwise it would overwrite the loaded parameters and cause errors.
        self.exe.run(fluid.default_startup_program())
        if not os.path.exists(self.args.checkpoint):
            os.makedirs(self.args.checkpoint)
        dataset = Reader(self.args.sparse_feature_dim)
        train_reader = paddle.batch(
            paddle.reader.shuffle(dataset.feed(glob.glob(os.path.join(self.args.train_dir, "part-*"))),
                                  buf_size=self.args.batch_size * 100),
            batch_size=self.args.batch_size)
        self.py_reader.decorate_paddle_reader(train_reader)
        feed_list = []
        fetch_list = [self.words[1], self.predict, self.loss, self.acc]
        # place = fluid.CPUPlace()
        # exe = fluid.Executor(place)
        exec_strategy = fluid.ExecutionStrategy()
        build_strategy = fluid.BuildStrategy()
        if os.getenv("NUM_THREADS", ""):
            exec_strategy.num_threads = int(os.getenv("NUM_THREADS"))
        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce if cpu_num > 1 \
            else fluid.BuildStrategy.ReduceStrategy.AllReduce
        self.compiled_program = fluid.CompiledProgram(self.main_program) \
            .with_data_parallel(loss_name=self.loss.name,
                                build_strategy=build_strategy,
                                exec_strategy=exec_strategy)
        metric = fluid.metrics.Auc(name="train_auc")
        for pass_id in xrange(self.args.num_passes):
            pass_start = time.time()
            self.py_reader.start()
            batch_id = 0
            try:
                while True:
                    label, predict, loss, acc = self.exe.run(self.compiled_program,
                                                             fetch_list=[self.words[1].name,
                                                                         self.predict.name,
                                                                         self.loss.name,
                                                                         self.acc.name])
                    loss = np.mean(loss)
                    acc = np.mean(acc)
                    metric.update(predict, label)
                    auc = metric.eval()
                    logger.info("TRAIN --> pass: {} batch: {:0=4} loss: {} acc: {} auc: {}"
                                .format(pass_id, batch_id, loss, acc, auc))
                    if batch_id % 100 == 0 and batch_id != 0:
                        if self.args.trainer_id == 0:
                            self.save(self.args.checkpoint, feed_list, fetch_list, pass_id, batch_id)
                    batch_id += 1
            except fluid.core.EOFException:
                self.py_reader.reset()
                print("pass_id: %d, pass_time_cost: %f" % (pass_id, time.time() - pass_start))
                if self.args.trainer_id == 0:
                    self.save(self.args.checkpoint, feed_list, fetch_list, pass_id, batch_id)
                pass

    def save(self, path, feed_list, fetch_list, pass_id, batch_id):
        """
        Save a checkpoint.
        :param path: output directory
        :param feed_list: feed variable names passed to save_inference_model
        :param fetch_list: target variables passed to save_inference_model
        :param pass_id: epoch index
        :param batch_id: iteration index
        """
        checkpoint_name = "ctr_model.ckpt-%04d-%04d" % (pass_id, batch_id)
        model_dir = os.path.join(path, checkpoint_name)
        # fluid.io.save_persistables(self.exe, model_dir, self.main_program)
        fluid.io.save_inference_model(model_dir, feed_list, fetch_list, self.exe, self.main_program)

    def load(self, path):
        """
        Load the latest checkpoint.
        :param path: checkpoint directory
        """
        logger.info("[*] Reading checkpoint...")
        models = glob.glob(os.path.join(path, "*.ckpt-*"))
        if models:
            models = sorted(models, key=lambda x: x[-4:])
            fluid.io.load_persistables(self.exe, models[-1], self.main_program)
            batch_id = int(models[-1][-4:])
            return True
        else:
            return False


if __name__ == "__main__":
    args = Config()
    args = args()
    ctr_demo = ctr_model(args)
    ctr_demo.train()
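The Reader and Config classes are not posted either. For reference, a minimal sketch of what Reader.feed could look like, assuming each line of the part-* files contains the space-separated word ids followed by the label, as in point 1; the parsing details are assumptions, not the poster's actual reader:

# python
class Reader(object):
    """Sketch of a reader yielding (word_ids, label) samples for paddle.batch / paddle.reader.shuffle."""

    def __init__(self, sparse_feature_dim):
        self.sparse_feature_dim = sparse_feature_dim

    def feed(self, file_list):
        def reader():
            for path in file_list:
                with open(path) as f:
                    for line in f:
                        tokens = line.strip().split()
                        if not tokens:
                            continue
                        # everything but the last token is a searched-word id; the last token is the label
                        word_ids = [int(t) for t in tokens[:-1]]
                        label = [int(tokens[-1])]
                        yield word_ids, label
        return reader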
Validation:
# python
# Assumes the same imports as the training script, plus an EVAL_INTERVAL_SECS constant defined elsewhere.
def eval():
    """
    Evaluate the model.
    """
    args = Config()
    args = args()
    place = fluid.CPUPlace()
    inference_scope = fluid.core.Scope()
    dataset = Reader(args.sparse_feature_dim)
    test_reader = paddle.batch(dataset.feed(glob.glob(os.path.join(args.test_dir, "part-*"))),
                               batch_size=args.batch_size * 100)
    startup_program = fluid.framework.Program()
    main_program = fluid.framework.Program()
    with fluid.framework.program_guard(main_program, startup_program):
        _, words, predict, loss, acc = ctr_dnn_model(args.embed_size, args.sparse_feature_dim, is_training=False)
        feeder = fluid.DataFeeder(feed_list=words, place=place)
        exe = fluid.Executor(place)
        metric = fluid.metrics.Auc(name="valid_auc")
        while True:
            logger.info("[*] Reading checkpoint...")
            ckpt = glob.glob(os.path.join(args.checkpoint, "*.ckpt-*"))
            if ckpt:
                logger.info("[*] Load Success...")
                ckpt = sorted(ckpt, key=lambda x: x[-9:])[-1]
                logger.info(ckpt)
                with fluid.scope_guard(inference_scope):
                    [inference_program, _, fetch_targets] = fluid.io.load_inference_model(ckpt, exe)

                    def set_zero(var_name):
                        """set auc state list to 0"""
                        param = inference_scope.var(var_name).get_tensor()
                        param_array = np.zeros(param._get_dims()).astype("int64")
                        param.set(param_array, place)

                    auc_states_names = ['_generated_var_2', '_generated_var_3']
                    # for name in auc_states_names:
                    #     set_zero(name)
                    for batch_id, data in enumerate(test_reader()):
                        label, predict, loss, acc = exe.run(inference_program,
                                                            feed=feeder.feed(data),
                                                            fetch_list=fetch_targets)
                        loss = np.mean(loss)
                        acc = np.mean(acc)
                        metric.update(predict, label)
                        auc = metric.eval()
                        # if batch_id % 100 == 0:
                        logger.info("TEST --> batch: {} loss: {} acc: {} auc: {}".format(batch_id, loss, acc, auc))
            else:
                logger.info("No checkpoint file found")
            time.sleep(EVAL_INTERVAL_SECS)


if __name__ == '__main__':
    eval()