optimizer.minimize raises "grad_op_maker_ should not be null"
Created by: xiangyubo
- Version and environment info: 1) PaddlePaddle version: 1.2.0 2) Environment: AI Studio 3) Python 3.5
- Reproduction info:
- Problem description: Running my paddle code on AI Studio to train a ResNet model fails at runtime. The error points at these lines:

```
adam = fluid.optimizer.Adam(learning_rate=0.1)
adam.minimize(avg_cost)    <-------------- this line raises the error
```

The exception message at the very bottom is:
```
grad_op_descs.extend(grad_op_desc)

EnforceNotMet: grad_op_maker_ should not be null
Operator GradOpMaker has not been registered. at [/paddle/paddle/fluid/framework/op_info.h:61]
PaddlePaddle Call Stacks:
0       0x7f73e79158a6p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
```
Full exception stack:
```
Traceback (most recent call last)
<ipython-input-2-a6ee3a8ed489> in <module>
    262 main_program = fluid.default_main_program()
    263 adam = fluid.optimizer.Adam(learning_rate=0.1)
--> 264 adam.minimize(avg_cost)
    265 logger.info("create place, use gpu: " + str(train_parameters['use_gpu']))
    266 exe = fluid.Executor(place)

/opt/conda/envs/py35-paddle1.2.0/lib/python3.5/site-packages/paddle/fluid/optimizer.py in minimize(self, loss, startup_program, parameter_list, no_grad_set)
    303         """
    304         params_grads = append_backward(loss, parameter_list, no_grad_set,
--> 305                                        [error_clip_callback])
    306
    307         params_grads = sorted(params_grads, key=lambda x: x[0].name)

/opt/conda/envs/py35-paddle1.2.0/lib/python3.5/site-packages/paddle/fluid/backward.py in append_backward(loss, parameter_list, no_grad_set, callbacks)
    581
    582     _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
--> 583                           grad_to_var, callbacks)
    584
    585     # Because calc_gradient may be called multiple times,

/opt/conda/envs/py35-paddle1.2.0/lib/python3.5/site-packages/paddle/fluid/backward.py in _append_backward_ops_(block, ops, target_block, no_grad_dict, grad_to_var, callbacks)
    367         # Getting op's corresponding grad_op
    368         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
--> 369             op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
    370
    371         grad_op_descs.extend(grad_op_desc)

EnforceNotMet: grad_op_maker_ should not be null
Operator GradOpMaker has not been registered. at [/paddle/paddle/fluid/framework/op_info.h:61]
PaddlePaddle Call Stacks:
0       0x7f73e79158a6p paddle::platform::EnforceNotMet::EnforceNotMet(std::__exception_ptr::exception_ptr, char const*, int) + 486
```
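To help locate which operator the backward pass chokes on, here is a small debugging sketch (my own idea, assuming the `Program.blocks` / `op.type` attributes of fluid 1.2; the helper name `dump_op_types` is made up) that prints the type of every operator `append_backward` will walk:

```python
import paddle.fluid as fluid

def dump_op_types(program=None):
    # Print every operator type in every block of the program, so the
    # op whose GradOpMaker is not registered can be spotted by eye.
    program = program or fluid.default_main_program()
    for block in program.blocks:
        for op in block.ops:
            print(block.idx, op.type)
```

Calling `dump_op_types()` right before `adam.minimize(avg_cost)` should list all forward ops built by the code below.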
My paddle code:
```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import numpy as np
import random
import time
import six
import sys
import functools
import math
import paddle
import paddle.fluid as fluid
import paddle.dataset.flowers as flowers
import argparse
import functools
import subprocess
import distutils.util
from paddle.fluid import core
from paddle.fluid.param_attr import ParamAttr
from PIL import Image, ImageEnhance
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
rq = time.strftime('%Y%m%d%H%M', time.localtime(time.time()))
log_path = os.path.join(os.getcwd(), 'logs')
if not os.path.exists(log_path):
    os.makedirs(log_path)
log_name = os.path.join(log_path, 'train.log')
fh = logging.FileHandler(log_name, mode='w')
fh.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
fh.setFormatter(formatter)
logger.addHandler(fh)

train_parameters = {
    "input_size": [3, 224, 224],
    "class_dim": 10,
    "input_mean": np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)),
    "input_std": np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)),
    "data_dir": "/home/work/xiangyubo/common_resource/classify-train-data",
    "mode": "train",
    "num_epochs": 30,
    "train_batch_size": 64,
    "use_gpu": True,
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 64,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}


class ResNet():
    def __init__(self, layers=50):
        self.params = train_parameters
        self.layers = layers

    def net(self, input, class_dim=1000):
        layers = self.layers
        supported_layers = [50, 101, 152]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)
        if layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        num_filters = [64, 128, 256, 512]

        conv = self.conv_bn_layer(
            input=input,
            num_filters=64,
            filter_size=7,
            stride=2,
            act='relu',
            name="conv1")
        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')
        for block in range(len(depth)):
            for i in range(depth[block]):
                if layers in [101, 152] and block == 2:
                    if i == 0:
                        conv_name = "res" + str(block + 2) + "a"
                    else:
                        conv_name = "res" + str(block + 2) + "b" + str(i)
                else:
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                conv = self.bottleneck_block(
                    input=conv,
                    num_filters=num_filters[block],
                    stride=2 if i == 0 and block != 0 else 1,
                    name=conv_name)

        pool = fluid.layers.pool2d(
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        out = fluid.layers.fc(input=pool,
                              size=class_dim,
                              param_attr=fluid.param_attr.ParamAttr(
                                  initializer=fluid.initializer.Uniform(-stdv, stdv)))
        return out

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False,
            name=name + '.conv2d.output.1')
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            name=bn_name + '.output.1',
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def shortcut(self, input, ch_out, stride, name):
        ch_in = input.shape[1]
        if ch_in != ch_out or stride != 1:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        else:
            return input

    def bottleneck_block(self, input, num_filters, stride, name):
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")
        short = self.shortcut(
            input, num_filters * 4, stride, name=name + "_branch1")
        return fluid.layers.elementwise_add(
            x=short, y=conv2, act='relu', name=name + ".add.output.5")


def ResNet50():
    model = ResNet(layers=50)
    return model


def ResNet101():
    model = ResNet(layers=101)
    return model


def ResNet152():
    model = ResNet(layers=152)
    return model


def resize_long(img, target_size):
    percent_h = float(target_size[1]) / img.size[1]
    percent_w = float(target_size[2]) / img.size[0]
    percent = min(percent_h, percent_w)
    resized_width = int(round(img.size[0] * percent))
    resized_height = int(round(img.size[1] * percent))
    img = img.resize((resized_width, resized_height), Image.LANCZOS)
    return img


def custom_image_reader(file_list, data_dir, mode):
    def reader():
        with open(file_list) as flist:
            lines = [line.strip() for line in flist]
            np.random.shuffle(lines)
            for line in lines:
                if mode == 'train' or mode == 'val':
                    img_path, label = line.split()
                    img_path = os.path.join(data_dir, img_path)
                    img = Image.open(img_path)
                    img = resize_long(img, train_parameters['input_size'])
                    if img.mode != 'RGB':
                        img = img.convert('RGB')
                    img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
                    # img -= train_parameters['input_mean']
                    # img /= train_parameters['input_std']
                    target_size = train_parameters['input_size']
                    ret_x = np.zeros((target_size[0], target_size[1], target_size[2]), dtype=np.float32)
                    ret_x[0: img.shape[0], 0: img.shape[1], 0: img.shape[2]] = img
                    yield ret_x, int(label)
                elif mode == 'test':
                    img_path = os.path.join(data_dir, line)
                    yield Image.open(img_path)
    return reader


train_prog = fluid.Program()
train_startup = fluid.Program()
print("create prog success")
logger.info("create prog success")
file_list = os.path.join(train_parameters['data_dir'], "label.txt")
mode = train_parameters['mode']
batch_reader = paddle.batch(custom_image_reader(file_list, train_parameters['data_dir'], mode),
                            batch_size=train_parameters['train_batch_size'], drop_last=True)
print("create reader success, image size:")
print([-1] + train_parameters['input_size'])
logger.info("create place start, use gpu:" + str(train_parameters['use_gpu']))
place = fluid.CUDAPlace(7) if train_parameters['use_gpu'] else fluid.CPUPlace()
logger.info("build network")
img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
model = ResNet101()
out = model.net(input=img, class_dim=train_parameters['class_dim'])
cost, pred = fluid.layers.softmax_with_cross_entropy(out, label, return_softmax=True)
logger.info(cost)
logger.info(pred)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
acc_top3 = fluid.layers.accuracy(input=pred, label=label, k=3)
img = fluid.layers.data(name='img', shape=train_parameters['input_size'], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
main_program = fluid.default_main_program()
adam = fluid.optimizer.Adam(learning_rate=0.1)
adam.minimize(avg_cost)
logger.info("create place, use gpu: " + str(train_parameters['use_gpu']))
exe = fluid.Executor(place)
logger.info("create place, set executor place")
exe.run(fluid.default_startup_program())
logger.info("create place success")
train_fetch_list = [avg_cost.name, acc_top1.name, acc_top3.name, out.name]
successive_count = 0
stop_train = False
for pass_id in range(train_parameters["num_epochs"]):
    logger.info("current pass: %d, start read image", pass_id)
    train_info = [[], [], []]
    train_time = []
    batch_id = 0
    for step_id, data in enumerate(batch_reader()):
        t1 = time.time()
        loss, acc1, acc3, pred_ot = exe.run(main_program,
                                            feed=feeder.feed(data),
                                            fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1
        loss = np.mean(np.array(loss))
        acc1 = np.mean(np.array(acc1))
        acc3 = np.mean(np.array(acc3))
        train_info[0].append(loss)
        train_info[1].append(acc1)
        train_info[2].append(acc3)
        train_time.append(period)
        if batch_id % 1 == 0:
            sys.stdout.flush()
            logger.info("Pass {0}, trainbatch {1}, loss {2}, \
                        acc1 {3}, acc3 {4} time {5}"
                        .format(pass_id, batch_id, loss, acc1, acc3,
                                "%2.2f sec" % period))
        batch_id += 1
        if acc1 >= 0.90:
            successive_count += 1
            fluid.io.save_inference_model(params_filename='./classify-model/resnet-classify-params',
                                          model_filename='./classify-model/resnet-classify-model',
                                          feeded_var_names=['img'], target_vars=[out], executor=exe)
            if successive_count >= 3:
                logger.info("end training")
                stop_train = True
                break
        else:
            successive_count = 0
    if stop_train:
        break
```
There is a similar issue, https://github.com/PaddlePaddle/Paddle/issues/9281, but I am not using parallel_op.
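To rule parallel_op out, I can scan the program for it with a quick sketch (assuming the 1.x op type is named `parallel_do`, as in that issue):

```python
import paddle.fluid as fluid

# Collect the type of every op in the main program and check whether a
# parallel_do op (the culprit in issue #9281) sneaked in somewhere.
op_types = {op.type
            for block in fluid.default_main_program().blocks
            for op in block.ops}
print('parallel_do' in op_types)  # expected to print False for my program
```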