exe.run(startup_prog) show errors in train.py PaddleCv/face_detection (#4361) · Issue · PaddlePaddle / models

exe.run(startup_prog) show errors in train.py PaddleCv/face_detection

Created by: phamkhactu

i debug it show erros in line code: exe.run(startup_prog)

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import shutil
import numpy as np
import time
import argparse
import functools


def set_paddle_flags(**kwargs):
    for key, value in kwargs.items():
        if os.environ.get(key, None) is None:
            os.environ[key] = str(value)


# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect. 
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)

import paddle
import paddle.fluid as fluid
from pyramidbox import PyramidBox
import reader
from utility import add_arguments, print_arguments, check_cuda

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)

# yapf: disable
add_arg('parallel',         bool,  True,            "Whether use multi-GPU/threads or not.")
add_arg('learning_rate',    float, 0.001,           "The start learning rate.")
add_arg('batch_size',       int,   16,              "Minibatch size.")
add_arg('epoc_num',         int,   160,             "Epoch number.")
add_arg('use_gpu',          bool,  True,            "Whether use GPU.")
add_arg('use_pyramidbox',   bool,  True,            "Whether use PyramidBox model.")
add_arg('model_save_dir',   str,   'output',        "The path to save model.")
add_arg('resize_h',         int,   640,             "The resized image height.")
add_arg('resize_w',         int,   640,             "The resized image width.")
add_arg('mean_BGR',         str,   '104., 117., 123.', "Mean value for B,G,R channel which will be subtracted.")
add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
add_arg('data_dir',         str,   'data',          "The base dir of dataset")
add_arg('use_multiprocess', bool,  False,            "Whether use multi-process for data preprocessing.")
parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
parser.add_argument('--batch_num', type=int, help="batch num for ce")
parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
#yapf: enable

train_parameters = {
    "train_images": 12880,
    "image_shape": [3, 640, 640],
    "class_num": 2,
    "batch_size": 16,
    "lr": 0.001,
    "lr_epochs": [99, 124, 149],
    "lr_decay": [1, 0.1, 0.01, 0.001],
    "epoc_num": 160,
    "optimizer_method": "momentum",
    "use_pyramidbox": True
}

def optimizer_setting(train_params):
    batch_size = train_params["batch_size"]
    iters = train_params["train_images"] // batch_size
    lr = train_params["lr"]
    optimizer_method = train_params["optimizer_method"]
    boundaries = [i * iters for i in train_params["lr_epochs"]]
    values = [i * lr for i in train_params["lr_decay"]]

    if optimizer_method == "momentum":
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    else:
        optimizer = fluid.optimizer.RMSProp(
            learning_rate=fluid.layers.piecewise_decay(boundaries, values),
            regularization=fluid.regularizer.L2Decay(0.0005),
        )
    return optimizer


def build_program(train_params, main_prog, startup_prog, args):
    use_pyramidbox = train_params["use_pyramidbox"]
    image_shape = train_params["image_shape"]
    class_num = train_params["class_num"]
    with fluid.program_guard(main_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(py_reader)
            fetches = []
            network = PyramidBox(image=image,
                                 face_box=face_box,
                                 head_box=head_box,
                                 gt_label=gt_label,
                                 sub_network=use_pyramidbox)
            if use_pyramidbox:
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            optimizer = optimizer_setting(train_params)
            optimizer.minimize(loss)
    return py_reader, fetches, loss

def train(args, config, train_params, train_file_list):
    batch_size = train_params["batch_size"]
    epoc_num = train_params["epoc_num"]
    optimizer_method = train_params["optimizer_method"]
    use_pyramidbox = train_params["use_pyramidbox"]

    use_gpu = args.use_gpu
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    batch_size_per_device = batch_size // devices_num
    iters_per_epoc = train_params["train_images"] // batch_size
    num_workers = 8
    is_shuffle = True

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    #only for ce
    if args.enable_ce:
        is_shuffle = False
        SEED = 102
        startup_prog.random_seed = SEED
        train_prog.random_seed = SEED
        num_workers = 1
        pretrained_model = ""
        if args.batch_num != None:
            iters_per_epoc = args.batch_num

    train_py_reader, fetches, loss = build_program(
        train_params = train_params,
        main_prog = train_prog,
        startup_prog = startup_prog,
        args=args)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    print(place)
    exe = fluid.Executor(place)
    exe.run(startup_prog)  <---------------- here ---------------------
    start_epoc = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            start_epoc = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " %(pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError("The pre-trained model path [%s] does not exist." %
                             (pretrained_model))
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog, predicate=if_exist)
    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle = is_shuffle,
                                use_multiprocess=args.use_multiprocess,
                                num_workers=num_workers)
    train_py_reader.decorate_paddle_reader(train_reader)

    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            main_program = train_prog,
            use_cuda=use_gpu,
            loss_name=loss.name)

    def save_model(postfix, program):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)

        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=program)

    total_time = 0.0
    epoch_idx = 0
    face_loss = 0
    head_loss = 0
    for pass_id in range(start_epoc, epoc_num):
        epoch_idx += 1
        start_time = time.time()
        prev_start_time = start_time
        end_time = 0
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if args.parallel:
                    fetch_vars = train_exe.run(fetch_list=
                        [v.name for v in fetches])
                else:
                    fetch_vars = exe.run(train_prog, fetch_list=fetches)
                end_time = time.time()
                fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
                face_loss = fetch_vars[0]
                head_loss = fetch_vars[1]
                if batch_id % 10 == 0:
                    if not args.use_pyramidbox:
                        print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
                            pass_id, batch_id, face_loss,
                            start_time - prev_start_time))
                    else:
                        print("Pass {:d}, batch {:d}, face loss {:.6f}, " \
                              "head loss {:.6f}, " \
                              "time {:.5f}".format(pass_id,
                               batch_id, face_loss, head_loss,
                               start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                train_py_reader.reset()
                break
        epoch_end_time = time.time()
        total_time += epoch_end_time - start_time
        save_model(str(pass_id), train_prog)

    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        print("kpis\teach_pass_duration_card%s\t%s" %
                (gpu_num, total_time / epoch_idx))
        print("kpis\ttrain_face_loss_card%s\t%s" %
                (gpu_num, face_loss))
        print("kpis\ttrain_head_loss_card%s\t%s" %
                (gpu_num, head_loss))



def get_cards(args):
    if args.enable_ce:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        num = len(cards.split(","))
        return num
    else:
        return args.num_devices


if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
    check_cuda(args.use_gpu)

    data_dir = os.path.join(args.data_dir, 'WIDER_train/images/')
    train_file_list = os.path.join(args.data_dir,
        'wider_face_split/wider_face_train_bbx_gt.txt')
    mean_BGR = [float(m) for m in args.mean_BGR.split(",")]
    image_shape = [3, int(args.resize_h), int(args.resize_w)]
    train_parameters["image_shape"] = image_shape
    train_parameters["use_pyramidbox"] = args.use_pyramidbox
    train_parameters["batch_size"] = args.batch_size
    train_parameters["lr"] = args.learning_rate
    train_parameters["epoc_num"] = args.epoc_num


    config = reader.Settings(
        data_dir=data_dir,
        resize_h=image_shape[1],
        resize_w=image_shape[2],
        apply_distort=True,
        apply_expand=False,
        mean_value=mean_BGR,
        ap_version='11point')
    train(args, config, train_parameters, train_file_list)

Here is errors:

-----------  Configuration Arguments -----------
batch_num: None
batch_size: 16
data_dir: data
enable_ce: False
epoc_num: 160
learning_rate: 0.001
mean_BGR: 104., 117., 123.
model_save_dir: output
num_devices: 1
parallel: True
pretrained_model: ./vgg_ilsvrc_16_fc_reduced/
resize_h: 640
resize_w: 640
use_gpu: True
use_multiprocess: False
use_pyramidbox: True
------------------------------------------------
2020-02-28 08:19:13,201-WARNING: paddle.fluid.layers.py_reader() may be deprecated in the near future. Please use paddle.fluid.io.DataLoader.from_generator() instead.
CUDAPlace(0)
/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py:779: UserWarning: The following exception is not an EOF exception.
  "The following exception is not an EOF exception.")
Traceback (most recent call last):
  File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/ptvsd_launcher.py", line 48, in <module>
    main(ptvsdArgs)
  File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/lib/python/old_ptvsd/ptvsd/__main__.py", line 432, in main
    run()
  File "/root/.vscode-server/extensions/ms-python.python-2020.2.64397/pythonFiles/lib/python/old_ptvsd/ptvsd/__main__.py", line 316, in run_file
    runpy.run_path(target, run_name='__main__')
  File "/usr/lib/python3.6/runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "/usr/lib/python3.6/runpy.py", line 96, in _run_module_code
    mod_name, mod_spec, pkg_name, script_name)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/root/phamkhactu/face_detection/train.py", line 285, in <module>
    train(args, config, train_parameters, train_file_list)
  File "/root/phamkhactu/face_detection/train.py", line 158, in train
    exe.run(startup_prog)
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 780, in run
    six.reraise(*sys.exc_info())
  File "/root/.local/lib/python3.6/site-packages/six.py", line 703, in reraise
    raise value
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 775, in run
    use_program_cache=use_program_cache)
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 822, in _run_impl
    use_program_cache=use_program_cache)
  File "/usr/local/lib/python3.6/dist-packages/paddle/fluid/executor.py", line 899, in _run_program
    fetch_var_name)
paddle.fluid.core_avx.EnforceNotMet: 

--------------------------------------------
C++ Call Stacks (More useful to developers):
--------------------------------------------
0   std::string paddle::platform::GetTraceBackString<char const*>(char const*&&, char const*, int)
1   paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int)
2   paddle::platform::CUDADeviceContext::CUDADeviceContext(paddle::platform::CUDAPlace)
3   std::_Function_handler<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > (), std::reference_wrapper<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >*, paddle::platform::Place)::{lambda()#1} ()> > >::_M_invoke(std::_Any_data const&)
4   std::_Function_handler<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> (), std::__future_base::_Task_setter<std::unique_ptr<std::__future_base::_Result<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::__future_base::_Result_base::_Deleter>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > >::_M_invoke(std::_Any_data const&)
5   std::__future_base::_State_base::_M_do_set(std::function<std::unique_ptr<std::__future_base::_Result_base, std::__future_base::_Result_base::_Deleter> ()>&, bool&)
6   std::__future_base::_Deferred_state<std::_Bind_simple<paddle::platform::EmplaceDeviceContext<paddle::platform::CUDADeviceContext, paddle::platform::CUDAPlace>(std::map<paddle::platform::Place, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >, std::less<paddle::platform::Place>, std::allocator<std::pair<paddle::platform::Place const, std::shared_future<std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > > > > >*, paddle::platform::Place)::{lambda()#1} ()>, std::unique_ptr<paddle::platform::DeviceContext, std::default_delete<paddle::platform::DeviceContext> > >::_M_run_deferred()
7   paddle::platform::DeviceContextPool::Get(paddle::platform::Place const&)
8   paddle::framework::GarbageCollector::GarbageCollector(paddle::platform::Place const&, unsigned long)
9   paddle::framework::UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(paddle::platform::CUDAPlace const&, unsigned long)
10  paddle::framework::Executor::RunPreparedContext(paddle::framework::ExecutorPrepareContext*, paddle::framework::Scope*, bool, bool, bool)
11  paddle::framework::Executor::Run(paddle::framework::ProgramDesc const&, paddle::framework::Scope*, int, bool, bool, std::vector<std::string, std::allocator<std::string> > const&, bool)

----------------------
Error Message Summary:
----------------------
Error: Paddle internal Check failed. (Please help us create a new issue, here we need to find the developer to add a user friendly error message): out of memory at (/paddle/paddle/fluid/platform/device_context.cc:220)

and nvidia-smi show 11GB not use

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 440.33.01    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce RTX 208...  On   | 00000000:01:00.0 Off |                  N/A |
| 45%   56C    P2    41W / 260W |  10905MiB / 11019MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0     23333      G   /usr/lib/xorg/Xorg                            46MiB |
|    0     23412      G   /usr/bin/sddm-greeter                         48MiB |
+-----------------------------------------------------------------------------+

PaddlePaddle / models 大约 2 年 前同步成功

exe.run(startup_prog) show errors in train.py PaddleCv/face_detection

i debug it show erros in line code: exe.run(startup_prog)

Here is errors:

and nvidia-smi show 11GB not use

PaddlePaddle / models
大约 2 年前同步成功