PaddleCV/image_classification 训练时报错(CPU没问题GPU出错)
Created by: liyutg
Paddle 版本:1.3.2
GPU 型号:GeForce GTX 1080(8119MiB)、Tesla K40m(11441MiB)
启动日志:Please NOTE: device: 1, CUDA Capability: 35, Driver API Version: 10.1, Runtime API Version: 9.0; device: 1, cuDNN Version: 7.0.
项目配置信息:
------------- Configuration Arguments -------------
batch_size : 256
checkpoint : None
class_dim : 61
data_dir : ./data/ILSVRC2012
enable_ce : False
fp16 : False
image_shape : 3,224,224
l2_decay : 0.0001
lr : 0.01
lr_strategy : piecewise_decay
model : AlexNet
model_save_dir : output
momentum_rate : 0.9
num_epochs : 1
pretrained_model : None
scale_loss : 1.0
total_images : 31718
use_gpu : True
with_mem_opt : False
报错信息
*** Aborted at 1555033639 (unix time) try "date -d @1555033639" if you are using GNU date ***
PC: @ 0x0 (unknown)
*** SIGSEGV (@0x0) received by PID 22659 (TID 0x7f0f01560740) from PID 0; stack trace: ***
    @ 0x7f0f00a55330 (unknown)
    @ 0x0 (unknown)
bash: 行 1: 22659 段错误 (核心已转储) env "PYCHARM_MATPLOTLIB_PORT"="59714" "PYTHONPATH"="/home/masdir/.pycharm_helpers/pycharm_matplotlib_backend:/home/data/cwz/py36/image_classification" "PYTHONUNBUFFERED"="1" "LD_LIBRARY_PATH"="/usr/local/cuda/lib64" "FLAGS_fraction_of_gpu_memory_to_use"="0.99" "JETBRAINS_REMOTE_RUN"="1" "PYTHONIOENCODING"="UTF-8" "PYCHARM_HOSTED"="1" /home/data/anaconda3/envs/py36-cwz/bin/python3.6 -u /home/data/cwz/py36/image_classification/train.py

Process finished with exit code 139
报错代码位置
# Reporter's note: compared with the official sample, only a small part of the
# data-loading code (the reader below) was modified.
def reader():
    """Yield image samples described by a JSON label file.

    The names ``file_list``, ``shuffle``, ``pass_id_as_seed``, ``mode`` and
    ``data_dir`` are free variables read from the enclosing scope (not
    visible in this snippet).

    ``file_list`` is parsed with ``json.load`` and is expected to hold a
    list of objects with an ``image_id`` key and, for labelled modes, a
    ``disease_class`` key convertible to ``int``.

    Yields:
        ``(path, label)`` tuples when ``mode`` is ``'train'`` or ``'val'``
        (path is ``data_dir + '/<mode>/' + image_id``), or a one-element
        list ``[path]`` when ``mode`` is ``'test'``. Any other mode yields
        nothing. Every ``"JPEG"`` substring of the image id is replaced by
        ``"jpeg"`` before the path is built.

    Raises:
        KeyError: if an entry lacks ``image_id``, or lacks ``disease_class``
            in a mode other than ``'test'``.
    """
    print("## now in reader")
    # Close the label file as soon as it is parsed instead of holding the
    # handle open while the generator is suspended between yields.
    with open(file_list) as flist:
        full_lines = json.load(flist)

    if shuffle:
        # A falsy seed (0 / None) leaves the global NumPy RNG state untouched.
        if pass_id_as_seed:
            np.random.seed(pass_id_as_seed)
        np.random.shuffle(full_lines)
    print("## now in reader1")

    if mode == 'train' and os.getenv('PADDLE_TRAINING_ROLE'):
        # Distributed training: each trainer takes its own contiguous,
        # equally sized shard; the remainder of the division is dropped.
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
        per_node_lines = len(full_lines) // trainer_count
        shard_start = trainer_id * per_node_lines
        lines = full_lines[shard_start:shard_start + per_node_lines]
        print(
            "read images from %d, length: %d, lines length: %d, total: %d"
            % (shard_start, per_node_lines, len(lines), len(full_lines)))
    else:
        lines = full_lines
    print("## now in reader2")

    # Pair every entry with its own label up front, so malformed entries
    # fail before the first sample is produced. Keeping the label next to
    # the entry (rather than in a dict keyed by image id) means duplicate
    # ids keep their individual labels. Test mode never yields a label, so
    # it does not require one to be present.
    needs_label = mode != 'test'
    samples = [
        (entry['image_id'],
         int(entry['disease_class']) if needs_label else None)
        for entry in lines
    ]
    print("## now in reader3")

    if mode in ('train', 'val'):
        for image_id, label in samples:
            file_name = image_id.replace("JPEG", "jpeg")
            yield data_dir + '/' + mode + '/' + file_name, label
    elif mode == 'test':
        for image_id, _ in samples:
            file_name = image_id.replace("JPEG", "jpeg")
            yield [os.path.join(data_dir, file_name)]