验证集上效果和测试集上效果相差较大
Created by: JingChunzhen
目前遇到的问题是:验证集上的效果和测试集上的效果相差较大,当前相差五个百分点左右。怀疑的原因是在训练过程中误将验证集的数据也用于训练(train)了。以下是我的训练脚本,目前暂未排查出问题,希望 Paddle 同学可以协助排查,非常感谢!
paddle 1.5.0版本
#!/usr/bin/env python
#-*- coding:utf8 -*-
import json
import os
import sys
import paddle
import paddle.fluid as fluid
import paddle.fluid.param_attr as attr
import numpy as np
import sklearn
from sklearn.metrics import roc_auc_score
from model.double_tower import ResnetErnie
from model.double_tower import ResnetBow
from reader.tokenization import FullTokenizer
from reader.relevance_reader import RelevanceReader
def main():
    """Build the ResnetBow model, train it, and periodically evaluate
    loss and AUC on the held-out dev set.

    Reads module-level configuration set in the ``__main__`` block:
    ``config_use_cuda``, ``train_file_names``, ``train_data_sizes``,
    ``test_file_name`` and ``test_data_size``.
    """
    instance = ResnetBow()
    instance.forward()
    pred = instance.logits
    loss = instance.loss
    input_label = instance.input_hard_label

    # Clone the evaluation program BEFORE minimize() so that the test
    # program contains no optimizer ops and runs with for_test=True
    # semantics (e.g. dropout/batch-norm in inference mode).
    test_program = fluid.default_main_program().clone(for_test=True)
    fluid.optimizer.Adam(learning_rate=0.001).minimize(loss)

    # decorate_batch_generator expects a list of places; wrap the CPU
    # fallback in a list (cuda_places() already returns a list).
    places = fluid.cuda_places() if config_use_cuda else [fluid.CPUPlace()]
    place = fluid.CUDAPlace(0)
    print(fluid.core.get_cuda_device_count())
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # NOTE(review): `some_file_name` is not defined anywhere in this
    # file — confirm it is the intended pretrained-checkpoint path.
    init_model(
        exe=exe,
        program=fluid.default_main_program(),
        file_name=some_file_name)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = fluid.core.get_cuda_device_count()
    exec_strategy.num_iteration_per_drop_scope = 100
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    train_exe = fluid.ParallelExecutor(
        use_cuda=config_use_cuda,
        main_program=fluid.default_main_program(),
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
    # share_vars_from makes evaluation read the training executor's
    # parameters directly — no weight copy, no training of dev data.
    test_exe = fluid.ParallelExecutor(
        use_cuda=config_use_cuda,
        main_program=test_program,
        build_strategy=build_strategy,
        share_vars_from=train_exe,
        exec_strategy=exec_strategy)

    data_reader = RelevanceReader()
    feed_list = [
        instance.input_src_ids,
        instance.input_txt_ids,
        instance.input_pos_ids,
        instance.input_mask,
        instance.input_image,
        instance.input_hard_label]
    # BUG FIX: the module-level config names are `train_file_names` /
    # `train_data_sizes`; the previous singular names raised NameError.
    train_batch_gen = data_reader.multiprocessing_wrapper(
        file_names=train_file_names,
        data_sizes=train_data_sizes,
        num_workers=5,
        epochs=10)
    test_batch_gen = data_reader.batch_wrapper(
        file_name=test_file_name,
        data_size=test_data_size,
        batch_size=128,
        shuffle=True)
    train_reader = fluid.io.PyReader(
        feed_list=feed_list,
        capacity=5,
        use_double_buffer=True,
        iterable=True)
    train_reader.decorate_batch_generator(train_batch_gen, places=places)
    test_reader = fluid.io.PyReader(
        feed_list=feed_list,
        capacity=5,
        use_double_buffer=True,
        iterable=True)
    test_reader.decorate_batch_generator(test_batch_gen, places=places)

    cnt = 0
    for train_data in train_reader():
        _loss = train_exe.run(
            feed=train_data,
            fetch_list=[loss.name])
        print("{}\t{}".format(cnt, _loss[0]))
        cnt += 1
        if cnt % 200 == 0:
            # Periodic evaluation on the dev set.
            test_cnt = 0
            losses = []
            all_preds = []
            all_labels = []
            for test_data in test_reader():
                _test_loss, _pred, _label = test_exe.run(
                    feed=test_data,
                    fetch_list=[loss.name, pred.name, input_label.name],
                    return_numpy=True)
                # BUG FIX: fetched values were previously discarded and
                # `losses`/`auces` stayed empty — no metric was computed.
                losses.append(np.mean(_test_loss))
                all_preds.append(np.asarray(_pred))
                all_labels.append(np.asarray(_label))
                test_cnt += 1
                # NOTE(review): only 5 shuffled batches (~640 examples)
                # are evaluated per checkpoint — a very noisy dev
                # estimate, which alone can explain a several-point gap
                # versus the full test set. Consider evaluating the
                # whole dev set (remove this cap) for the final numbers.
                if test_cnt >= 5:
                    break
            preds_arr = np.concatenate(all_preds)
            labels_arr = np.concatenate(all_labels).ravel()
            # Presumably `logits` is either a 1-D score or a 2-column
            # (neg, pos) output; use the positive column in the latter
            # case — TODO confirm against the model definition.
            if preds_arr.ndim == 2 and preds_arr.shape[1] == 2:
                scores = preds_arr[:, 1]
            else:
                scores = preds_arr.ravel()
            try:
                auc = roc_auc_score(labels_arr, scores)
            except ValueError:
                # Sampled batches may contain a single class.
                auc = float("nan")
            print("eval\tstep={}\tloss={:.6f}\tauc={:.6f}".format(
                cnt, float(np.mean(losses)), auc))
if __name__ == "__main__":
config_use_cuda = True
train_file_names = "/home/work/train_set"
train_data_sizes = 10078937
test_file_name = "/home/work/dev.data"
test_data_size = 1110000
main()