Unverified commit e83e3038, authored by littletomatodonkey, committed by GitHub

fix local rank and get world size in dist (#402)

* fix local rank
* fix export model
Parent 6a5f4626
@@ -13,7 +13,6 @@
 # limitations under the License.
 import paddle
-from paddle.distributed import ParallelEnv
 import argparse
 import os
@@ -52,16 +51,18 @@ def main(args, return_dict={}):
     config.mode = "valid"
     # assign place
     use_gpu = config.get("use_gpu", True)
-    place = 'gpu:{}'.format(ParallelEnv().dev_id) if use_gpu else 'cpu'
-    place = paddle.set_device(place)
+    place = paddle.set_device('gpu' if use_gpu else 'cpu')

-    use_data_parallel = int(os.getenv("PADDLE_TRAINERS_NUM", 1)) != 1
+    trainer_num = paddle.distributed.get_world_size()
+    use_data_parallel = trainer_num != 1
     config["use_data_parallel"] = use_data_parallel

+    if config["use_data_parallel"]:
+        paddle.distributed.init_parallel_env()
+
     net = program.create_model(config.ARCHITECTURE, config.classes_num)
     if config["use_data_parallel"]:
-        strategy = paddle.distributed.init_parallel_env()
-        net = paddle.DataParallel(net, strategy)
+        net = paddle.DataParallel(net)

     init_model(config, net, optimizer=None)
     valid_dataloader = Reader(config, 'valid', places=place)()
......
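Note: the change on the evaluation side boils down to one pattern: derive the trainer count from the distributed context instead of environment variables, and drop the strategy object that paddle.DataParallel no longer takes. A minimal sketch of that pattern, assuming Paddle 2.0-rc era APIs (the helper name wrap_for_eval is hypothetical):

    import paddle

    def wrap_for_eval(net, use_gpu=True):
        # The launcher binds each process to its own card, so the device
        # string no longer needs an explicit dev_id.
        paddle.set_device('gpu' if use_gpu else 'cpu')
        # get_world_size() reads the trainer count exported by the
        # launcher, replacing the manual PADDLE_TRAINERS_NUM lookup.
        if paddle.distributed.get_world_size() != 1:
            # init_parallel_env() now returns None; DataParallel no
            # longer accepts a strategy argument.
            paddle.distributed.init_parallel_env()
            net = paddle.DataParallel(net)
        return net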
@@ -37,23 +37,17 @@ def parse_args():
         "-o", "--output_path", type=str, default="./inference/cls_infer")
     parser.add_argument("--class_dim", type=int, default=1000)
     parser.add_argument("--load_static_weights", type=str2bool, default=False)
-    # parser.add_argument("--img_size", type=int, default=224)
+    parser.add_argument("--img_size", type=int, default=224)
     return parser.parse_args()


 class Net(paddle.nn.Layer):
-    def __init__(self, net, to_static, class_dim, model):
+    def __init__(self, net, class_dim, model):
         super(Net, self).__init__()
         self.pre_net = net(class_dim=class_dim)
-        self.to_static = to_static
         self.model = model

-    # Please modify the 'shape' according to actual needs
-    @to_static(input_spec=[
-        paddle.static.InputSpec(
-            shape=[None, 3, 224, 224], dtype='float32')
-    ])
     def forward(self, inputs):
         x = self.pre_net(inputs)
         if self.model == "GoogLeNet":
@@ -66,14 +60,19 @@ def main():
     args = parse_args()
     net = architectures.__dict__[args.model]
-    model = Net(net, to_static, args.class_dim, args.model)
+    model = Net(net, args.class_dim, args.model)
     load_dygraph_pretrain(
         model.pre_net,
         path=args.pretrained_model,
         load_static_weights=args.load_static_weights)
     model.eval()
+    model = to_static(
+        model,
+        input_spec=[
+            paddle.static.InputSpec(
+                shape=[None, 3, args.img_size, args.img_size], dtype='float32')
+        ])
     paddle.jit.save(model, args.output_path)
......
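Note: moving to_static from a decorator on forward() to a plain call in main() means the InputSpec shape can come from --img_size at export time instead of being hard-coded. A usage sketch for consuming the exported model (paths and shapes follow the defaults above; Paddle 2.0-rc assumed):

    import paddle

    # paddle.jit.save above wrote <output_path>.pdmodel/.pdiparams;
    # paddle.jit.load restores them as a callable static-graph layer.
    model = paddle.jit.load("./inference/cls_infer")
    model.eval()
    x = paddle.randn([1, 3, 224, 224])  # matches the exported InputSpec
    out = model(x)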
@@ -26,7 +26,6 @@ from ppcls.utils.save_load import load_dygraph_pretrain
 from ppcls.modeling import architectures
 import paddle
-from paddle.distributed import ParallelEnv
 import paddle.nn.functional as F
@@ -64,8 +63,7 @@ def save_prelabel_results(class_id, input_filepath, output_idr):
 def main():
     args = utils.parse_args()
     # assign the place
-    place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu'
-    place = paddle.set_device(place)
+    place = paddle.set_device('gpu' if args.use_gpu else 'cpu')

     net = architectures.__dict__[args.model](class_dim=args.class_num)
     load_dygraph_pretrain(net, args.pretrained_model, args.load_static_weights)
......
@@ -149,7 +149,7 @@ def create_loss(out,
         feed_lam = paddle.reshape(feeds['feed_lam'], [-1, 1])
     else:
         target = paddle.reshape(feeds['label'], [-1, 1])

     if architecture["name"] == "GoogLeNet":
         assert len(out) == 3, "GoogLeNet should have 3 outputs"
         loss = GoogLeNetLoss(class_dim=classes_num, epsilon=epsilon)
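Note: the assert reflects that GoogLeNet forwards three outputs: the main logits plus two auxiliary classifier heads. As a rough sketch of how such a loss is typically combined (the 0.3 auxiliary weight follows the original GoogLeNet paper and is not necessarily what this repo's GoogLeNetLoss uses):

    import paddle.nn.functional as F

    def googlenet_style_loss(outs, label, aux_weight=0.3):
        # outs = [main_logits, aux1_logits, aux2_logits]
        main = F.cross_entropy(outs[0], label)
        aux = F.cross_entropy(outs[1], label) + F.cross_entropy(outs[2], label)
        return main + aux_weight * aux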
@@ -400,6 +400,7 @@ def compile(config, program, loss_name=None, share_prog=None):
 total_step = 0


 def run(dataloader,
         exe,
         program,
@@ -409,8 +410,7 @@ def run(dataloader,
         mode='train',
         config=None,
         vdl_writer=None,
-        lr_scheduler=None
-        ):
+        lr_scheduler=None):
     """
     Feed data to the model and fetch the measures and loss
@@ -434,11 +434,13 @@
     tic = time.time()
     for idx, batch in enumerate(dataloader()):
         batch_size = batch[0].shape()[0]
-        feed_dict = {key.name:batch[idx] for idx, key in enumerate(feeds.values())}
-        metrics = exe.run(
-            program=program,
-            feed=feed_dict,
-            fetch_list=fetch_list)
+        feed_dict = {
+            key.name: batch[idx]
+            for idx, key in enumerate(feeds.values())
+        }
+        metrics = exe.run(program=program,
+                          feed=feed_dict,
+                          fetch_list=fetch_list)
         batch_time.update(time.time() - tic)
         tic = time.time()
......
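Note: one readability nit in the reformatted comprehension: it reuses idx, the loop variable of the enclosing for idx, batch in enumerate(dataloader()). In Python 3 the comprehension has its own scope, so this is harmless, but a distinct name makes the intent clearer:

    feed_dict = {
        key.name: batch[slot]
        for slot, key in enumerate(feeds.values())
    }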
@@ -26,7 +26,6 @@ sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
 from sys import version_info
 import paddle
-from paddle.distributed import ParallelEnv
 from paddle.distributed import fleet
 from ppcls.data import Reader
@@ -66,8 +65,7 @@ def main(args):
     # assign the place
     use_gpu = config.get("use_gpu", True)
     assert use_gpu is True, "gpu must be true in static mode!"
-    place = 'gpu:{}'.format(ParallelEnv().dev_id)
-    place = paddle.set_device(place)
+    place = paddle.set_device("gpu")

     # startup_prog is used to do some parameter init work,
     # and train prog is used to hold the network
@@ -94,7 +92,7 @@ def main(args):
     # load model from 1. checkpoint to resume training, 2. pretrained model to finetune
     train_dataloader = Reader(config, 'train', places=place)()

-    if config.validate and ParallelEnv().local_rank == 0:
+    if config.validate and paddle.distributed.get_rank() == 0:
         valid_dataloader = Reader(config, 'valid', places=place)()
         compiled_valid_prog = program.compile(config, valid_prog)
@@ -110,14 +108,15 @@ def main(args):
     for epoch_id in range(config.epochs):
         # 1. train with train dataset
-        program.run(train_dataloader, exe, train_prog, train_feeds, train_fetchs, epoch_id,
-                    'train', config, vdl_writer, lr_scheduler)
-        if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
+        program.run(train_dataloader, exe, train_prog, train_feeds,
+                    train_fetchs, epoch_id, 'train', config, vdl_writer,
+                    lr_scheduler)
+        if paddle.distributed.get_rank() == 0:
             # 2. validate with validate dataset
             if config.validate and epoch_id % config.valid_interval == 0:
                 top1_acc = program.run(valid_dataloader, exe,
-                                       compiled_valid_prog, valid_feeds, valid_fetchs,
-                                       epoch_id, 'valid', config)
+                                       compiled_valid_prog, valid_feeds,
+                                       valid_fetchs, epoch_id, 'valid', config)
                 if top1_acc > best_top1_acc:
                     best_top1_acc = top1_acc
                     message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
......
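Note: ParallelEnv().local_rank, the PADDLE_TRAINER_ID environment variable, and paddle.distributed.get_rank() all resolve to the same trainer rank under paddle.distributed.launch, so the substitutions in this hunk are behavior-preserving; get_rank() is simply the supported entry point. A quick sketch of the correspondence (environment variable names as set by the launcher, stated here as an assumption):

    import os
    import paddle

    rank = paddle.distributed.get_rank()         # reads PADDLE_TRAINER_ID
    world = paddle.distributed.get_world_size()  # reads PADDLE_TRAINERS_NUM
    assert rank == int(os.getenv("PADDLE_TRAINER_ID", 0))
    assert world == int(os.getenv("PADDLE_TRAINERS_NUM", 1))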
@@ -24,7 +24,6 @@ sys.path.append(__dir__)
 sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
 import paddle
-from paddle.distributed import ParallelEnv
 from ppcls.data import Reader
 from ppcls.utils.config import get_config
@@ -57,29 +56,28 @@ def main(args):
     config = get_config(args.config, overrides=args.override, show=True)
     # assign the place
     use_gpu = config.get("use_gpu", True)
-    place = 'gpu:{}'.format(ParallelEnv().dev_id) if use_gpu else 'cpu'
-    place = paddle.set_device(place)
+    place = paddle.set_device('gpu' if use_gpu else 'cpu')

-    trainer_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
+    trainer_num = paddle.distributed.get_world_size()
     use_data_parallel = trainer_num != 1
     config["use_data_parallel"] = use_data_parallel

     if config["use_data_parallel"]:
-        strategy = paddle.distributed.init_parallel_env()
+        paddle.distributed.init_parallel_env()

     net = program.create_model(config.ARCHITECTURE, config.classes_num)
     optimizer, lr_scheduler = program.create_optimizer(
         config, parameter_list=net.parameters())

     if config["use_data_parallel"]:
-        net = paddle.DataParallel(net, strategy)
+        net = paddle.DataParallel(net)

     # load model from checkpoint or pretrained model
     init_model(config, net, optimizer)

     train_dataloader = Reader(config, 'train', places=place)()

-    if config.validate and ParallelEnv().local_rank == 0:
+    if config.validate and paddle.distributed.get_rank() == 0:
         valid_dataloader = Reader(config, 'valid', places=place)()

     last_epoch_id = config.get("last_epoch", -1)
@@ -91,7 +89,7 @@ def main(args):
         program.run(train_dataloader, config, net, optimizer, lr_scheduler,
                     epoch_id, 'train')

-        if not config["use_data_parallel"] or ParallelEnv().local_rank == 0:
+        if paddle.distributed.get_rank() == 0:
            # 2. validate with validate dataset
            if config.validate and epoch_id % config.valid_interval == 0:
                net.eval()
......
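Note: with rank and world size taken from the distributed context, the same dygraph entry point serves single-card and multi-card runs. A hedged launch sketch using paddle.distributed.spawn (the train wrapper name and nprocs value are hypothetical; launching via python -m paddle.distributed.launch works equally well):

    import paddle.distributed as dist

    def train():
        # the refactored main() above: get_world_size()/get_rank()
        # pick up the values that spawn() sets for each child process
        main(args)

    if __name__ == '__main__':
        dist.spawn(train, nprocs=4)  # one process per GPU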