exe.run(startup_prog) raises an error in train.py (PaddleCV/face_detection)
Created by: phamkhactu
When I debug the script, the error is raised at this line of code: exe.run(startup_prog)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import shutil
import numpy as np
import time
import argparse
import functools
def set_paddle_flags(**kwargs):
    """Export each keyword argument as an environment variable if it is unset.

    Every value is converted with ``str`` before being stored. A variable
    that already exists in ``os.environ`` keeps its current value.
    """
    for flag_name, flag_value in kwargs.items():
        if flag_name in os.environ:
            # Already exported by the caller's environment: leave it alone.
            continue
        os.environ[flag_name] = str(flag_value)
# NOTE(paddle-dev): Every flag passed here must be exported before
# `import paddle` executes; set afterwards, it would not take any effect.
# set_paddle_flags() leaves a flag untouched when the environment already
# defines it.
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)
import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
import reader
from utility import add_arguments, print_arguments, check_cuda
# Command-line interface of the training script.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# One row per option, registered through `add_arguments` in this order:
# (name, type, default, help).
_ARG_SPECS = (
    ('parallel', bool, True, "Whether use multi-GPU/threads or not."),
    ('learning_rate', float, 0.001, "The start learning rate."),
    ('batch_size', int, 16, "Minibatch size."),
    ('epoc_num', int, 160, "Epoch number."),
    ('use_gpu', bool, True, "Whether use GPU."),
    ('use_pyramidbox', bool, True, "Whether use PyramidBox model."),
    ('model_save_dir', str, 'output', "The path to save model."),
    ('resize_h', int, 640, "The resized image height."),
    ('resize_w', int, 640, "The resized image width."),
    ('mean_BGR', str, '104., 117., 123.', "Mean value for B,G,R channel which will be subtracted."),
    ('pretrained_model', str, './vgg_ilsvrc_16_fc_reduced/', "The init model path."),
    ('data_dir', str, 'data', "The base dir of dataset"),
    ('use_multiprocess', bool, False, "Whether use multi-process for data preprocessing."),
)
for _arg_name, _arg_type, _arg_default, _arg_help in _ARG_SPECS:
    add_arg(_arg_name, _arg_type, _arg_default, _arg_help)

# Options used by the continuous-evaluation (CE) mode are registered directly.
parser.add_argument(
    '--enable_ce',
    action='store_true',
    help='If set, run the task with continuous evaluation logs.')
parser.add_argument(
    '--batch_num',
    type=int,
    help="batch num for ce")
parser.add_argument(
    '--num_devices',
    type=int,
    default=1,
    help='Number of GPU devices')
#yapf: enable
# Default training hyper-parameters. The `__main__` section overwrites
# "image_shape", "use_pyramidbox", "batch_size", "lr" and "epoc_num" with the
# values parsed from the command line.
train_parameters = {
    # Divided by "batch_size" to obtain the number of iterations per epoch.
    "train_images": 12880,
    # Appended to a leading -1 to form the image slot shape of the py_reader.
    "image_shape": [3, 640, 640],
    "class_num": 2,
    "batch_size": 16,
    # Base learning rate; scaled by each entry of "lr_decay".
    "lr": 0.001,
    # Epoch indices that, multiplied by iterations per epoch, give the
    # boundaries of the piecewise learning-rate schedule.
    "lr_epochs": [99, 124, 149],
    # Multipliers applied to "lr" for each segment of the schedule.
    "lr_decay": [1, 0.1, 0.01, 0.001],
    "epoc_num": 160,
    # "momentum" selects Momentum; any other value selects RMSProp.
    "optimizer_method": "momentum",
    "use_pyramidbox": True,
}
def optimizer_setting(train_params):
    """Create the optimizer with a piecewise learning-rate schedule.

    Args:
        train_params: Dict providing "batch_size", "train_images", "lr",
            "optimizer_method", "lr_epochs" and "lr_decay".

    Returns:
        A ``fluid.optimizer.Momentum`` (momentum 0.9) when
        "optimizer_method" is "momentum", otherwise a
        ``fluid.optimizer.RMSProp``. Both use L2 decay of 0.0005.
    """
    per_batch = train_params["batch_size"]
    steps_per_epoch = train_params["train_images"] // per_batch
    base_lr = train_params["lr"]
    method = train_params["optimizer_method"]

    # Schedule boundaries are expressed in iterations, not epochs.
    boundaries = [epoch * steps_per_epoch for epoch in train_params["lr_epochs"]]
    values = [factor * base_lr for factor in train_params["lr_decay"]]

    # Both optimizers share the same schedule and weight decay.
    lr_schedule = fluid.layers.piecewise_decay(boundaries, values)
    weight_decay = fluid.regularizer.L2Decay(0.0005)

    if method != "momentum":
        return fluid.optimizer.RMSProp(
            learning_rate=lr_schedule,
            regularization=weight_decay,
        )
    return fluid.optimizer.Momentum(
        learning_rate=lr_schedule,
        momentum=0.9,
        regularization=weight_decay,
    )
def build_program(train_params, main_prog, startup_prog, args):
    """Build the training network, its reader and its optimizer.

    Everything is created under ``fluid.program_guard(main_prog,
    startup_prog)``.

    Args:
        train_params: Dict; "use_pyramidbox", "image_shape" and "class_num"
            are read here, and the dict is forwarded to optimizer_setting().
        main_prog: Program passed to ``program_guard`` as the main program.
        startup_prog: Program passed to ``program_guard`` as the startup
            program.
        args: Not used inside this function.

    Returns:
        Tuple ``(py_reader, fetches, loss)``. ``fetches`` is
        ``[face_loss, head_loss]`` when "use_pyramidbox" is true, otherwise
        ``[loss]``. ``loss`` is the variable handed to the optimizer.
    """
    # NOTE(review): block nesting below was reconstructed from a paste that
    # lost its indentation -- confirm against the original file.
    use_pyramidbox = train_params["use_pyramidbox"]
    image_shape = train_params["image_shape"]
    # Read but not used anywhere else in this function.
    class_num = train_params["class_num"]
    with fluid.program_guard(main_prog, startup_prog):
        # Four reader slots, in the order unpacked by read_file() below:
        # image, face_box, head_box, gt_label.
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(py_reader)
            fetches = []
            network = PyramidBox(image=image,
                                 face_box=face_box,
                                 head_box=head_box,
                                 gt_label=gt_label,
                                 sub_network=use_pyramidbox)
            if use_pyramidbox:
                # Two losses are fetched for logging; the third is optimized.
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                # A single loss is both fetched and optimized.
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            optimizer = optimizer_setting(train_params)
            optimizer.minimize(loss)
    return py_reader, fetches, loss
def train(args, config, train_params, train_file_list):
    """Run the training loop and save a checkpoint after every epoch.

    Args:
        args: Parsed command-line namespace. Reads ``use_gpu``,
            ``model_save_dir``, ``pretrained_model``, ``enable_ce``,
            ``batch_num``, ``parallel``, ``use_multiprocess`` and
            ``use_pyramidbox``.
        config: Reader settings object forwarded to ``reader.train``.
        train_params: Dict; "batch_size", "epoc_num" and "train_images" are
            read here, and the dict is forwarded to build_program().
        train_file_list: Annotation list path forwarded to ``reader.train``.

    Raises:
        ValueError: If a pretrained model is requested but its path does
            not exist.
    """
    # NOTE(review): block nesting below was reconstructed from a paste that
    # lost its indentation -- confirm against the original file.
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model

    # An unset or empty CUDA_VISIBLE_DEVICES splits to [""], i.e. one device.
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size_per_device = batch_size // devices_num
    # NOTE(review): iters_per_epoc (and its --batch_num override below) is
    # computed but never consumed by the loop in this function.
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    # only for ce
    if args.enable_ce:
        is_shuffle = False
        seed = 102
        startup_prog.random_seed = seed
        train_prog.random_seed = seed
        num_workers = 1
        pretrained_model = ""
        if args.batch_num is not None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params=train_params,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    print(place)
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_epoc = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            # A purely numeric value names a checkpoint saved by save_model()
            # under model_save_dir; training resumes from the next epoch.
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " %(pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist." %
                             (pretrained_model))

        def if_exist(var):
            # Load only the variables that have a file in the model directory.
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)

    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                use_multiprocess=args.use_multiprocess,
                                num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name)

    def save_model(postfix, program):
        # Replace any previous checkpoint stored under the same name.
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        # Kept separate from start_time, which is overwritten on every batch,
        # so that the epoch duration covers the whole pass.
        epoch_start_time = time.time()
        start_time = epoch_start_time
        batch_id = 0
        train_py_reader.start()
        while True:
            prev_start_time = start_time
            start_time = time.time()
            try:
                if args.parallel:
                    fetch_vars = train_exe.run(
                        fetch_list=[v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
            except (fluid.core.EOFException, StopIteration):
                # Reader exhausted: this epoch is finished.
                train_py_reader.reset()
                break

            fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
            face_loss = fetch_vars[0]
            # Only one loss is fetched when PyramidBox is disabled, so the
            # second entry exists only in PyramidBox mode.
            if len(fetch_vars) > 1:
                head_loss = fetch_vars[1]

            if batch_id % 10 == 0:
                # The reported time is the gap between the starts of the
                # previous and the current batch.
                if not args.use_pyramidbox:
                    print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                        pass_id, batch_id, face_loss,
                        start_time - prev_start_time))
                else:
                    print("Pass {:d}, batch {:d}, face loss {:.6f}, " \
                          "head loss {:.6f}, " \
                          "time {:.5f}".format(pass_id,
                              batch_id, face_loss, head_loss,
                              start_time - prev_start_time))
            batch_id += 1

        total_time += time.time() - epoch_start_time
        save_model(str(pass_id), train_prog)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        print("kpis\teach_pass_duration_card%s\t%s" %
              (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_face_loss_card%s\t%s" %
              (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" %
              (gpu_num, head_loss))
def get_cards(args):
    """Return the device count used to label the CE KPI lines.

    Args:
        args: Namespace providing ``enable_ce`` and ``num_devices``.

    Returns:
        In CE mode, the number of comma-separated entries in the
        ``CUDA_VISIBLE_DEVICES`` environment variable. When CE mode is off,
        or the variable is not set at all, ``args.num_devices``.
    """
    if not args.enable_ce:
        return args.num_devices

    cards = os.environ.get('CUDA_VISIBLE_DEVICES')
    if cards is None:
        # Variable not exported: use the explicit --num_devices value rather
        # than failing on None.split().
        return args.num_devices
    return len(cards.split(","))
if __name__ == '__main__':
    # Parse and echo the command line, then check CUDA availability against
    # the --use_gpu setting.
    args = parser.parse_args()
    print_arguments(args)
    check_cuda(args.use_gpu)

    # Image directory and annotation list, both relative to --data_dir.
    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
    train_file_list = os.path.join(
        args.data_dir, 'wider_face_split/wider_face_train_bbx_gt.txt')

    # --mean_BGR is a comma-separated string of per-channel means.
    mean_BGR = list(map(float, args.mean_BGR.split(",")))
    image_shape = [3, int(args.resize_h), int(args.resize_w)]

    # Command-line values replace the module-level defaults.
    train_parameters.update(
        image_shape=image_shape,
        use_pyramidbox=args.use_pyramidbox,
        batch_size=args.batch_size,
        lr=args.learning_rate,
        epoc_num=args.epoc_num,
    )

    config = reader.Settings(
        data_dir=data_dir,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        apply_distort=True,
        apply_expand=False,
        mean_value=mean_BGR,
        ap_version='11point')
    train(args, config, train_parameters, train_file_list)
Here is the error output:
----------- Configuration Arguments -----------
batch_num: None
batch_size: 16
data_dir: data
enable_ce: False
epoc_num: 160
learning_rate: 0.001
mean_BGR: 104., 117., 123.
model_save_dir: output
num_devices: 1
parallel: True
pretrained_model: ./vgg_ilsvrc_16_fc_reduced/
resize_h: 640
resize_w: 640
use_gpu: True
use_multiprocess: False
use_pyramidbox: True
------------------------------------------------
2020-02-28 08:19:13,201-WARNING: paddle.fluid.layers.py_reader() may be deprecated in the near future. Please use paddle.fluid.io.DataLoader.from_generator() instead.
CUDAPlace(0)
/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py:779: UserWarning: The following exception is not an EOF exception.
"The following exception is not an EOF exception.")
Traceback (most recent call last):
File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/ptvsd_launcher.py", line 48, in <module>
main(ptvsdArgs)
File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/lib/python/old_ptvsd/ptvsd/__main__.py", line 432, in main
run()
File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/lib/python/old_ptvsd/ptvsd/__main__.py", line 316, in run_file
runpy.run_path(target, run_name='__main__')
File "/usr/lib/python3.6/runpy.py", line 263, in run_path
pkg_name=pkg_name, script_name=fname)
File "/usr/lib/python3.6/runpy.py", line 96, in _run_module_code
mod_name, mod_spec, pkg_name, script_name)
File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/root/phamkhactu/face_detection/train.py", line 285, in <module>
train(args, config, train_parameters, train_file_list)
File "/root/phamkhactu/face_detection/train.py", line 158, in train
exe.run(startup_prog)
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 780, in run
six.reraise(*sys.exc_info())
File "/root/.local/lib/python3.6/site-packages/six.py", line 703, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 775, in run
use_program_cache=use_program_cache)
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 822, in _run_impl
use_program_cache=use_program_cache)
File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 899, in _run_program
fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet:
--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0 std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1 paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2 paddle::platform::CUDADeviceContext::CUDADeviceContext(paddle::platform::CUDAPlace)
3 std::_Function_handler<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > (), std::reference_wrapper<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >*, paddle::platform::Place)::{lambda()#1} ()> > >::_M_invoke(std::_Any_data const&)
4 std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > >::_M_invoke(std::_Any_data const&)
5 std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&)
6 std::__future_base::_Deferred_state<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >*, paddle::platform::Place)::{lambda()#1} ()>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >::_M_run_deferred()
7 paddle::platform::DeviceContextPool::Get(paddle::platform::Place const&)
8 paddle::framework::GarbageCollector::GarbageCollector(paddle::platform::Place const&, unsigned long)
9 paddle::framework::UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(paddle::platform::CUDAPlace const&, unsigned long)
10 paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
11 paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string> > const&, bool)
----------------------
Error Message Summary:
----------------------
Error: Paddle internal Check failed. (Please help us create a new issue, here we need to find the developer to add a user friendly error message): out of memory at (/paddle/paddle/fluid/platform/device_context.cc:220)
And nvidia-smi lists no compute process using the 11 GB GPU, yet it reports 10905MiB / 11019MiB of memory as in use:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01 Driver Version: 440.33.01 CUDA Version: 10.2 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce RTX 208... On | 00000000:01:00.0 Off | N/A |
| 45% 56C P2 41W / 260W | 10905MiB / 11019MiB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: GPU Memory |
| GPU PID Type Process name Usage |
|=============================================================================|
| 0 23333 G /usr/lib/xorg/Xorg 46MiB |
| 0 23412 G /usr/bin/sddm-greeter 48MiB |
+-----------------------------------------------------------------------------+