Unverified commit eb32db18, authored by zhengya01, committed by GitHub

Merge pull request #14 from PaddlePaddle/develop

update
@@ -39,6 +39,8 @@ You can test if distributed training works on a single node before deploying to
 ***NOTE: for best performance, we recommend using multi-process mode, see No. 3, together with fp16.***
+***NOTE: for nccl2 distributed mode, you must ensure each node trains the same number of samples, or set skip_unbalanced_data to 1 to do sync training.***
 1. simply run `python dist_train.py` to start local training with the default configuration.
 2. for pserver mode, run `bash run_ps_mode.sh` to start 2 pservers and 2 trainers; these 2 trainers
    will use GPU 0 and 1 to simulate 2 workers.
@@ -90,4 +92,19 @@ The default resnet50 distributed training config is based on this paper: https:/
 ### Performance
-TBD
+The figure below shows Fluid distributed training performance. The results were collected on a 4-node V100 GPU cluster,
+each node with 8 V100 GPU cards, 32 GPUs in total. All modes can reach the state of the art of the ResNet50 model on the
+imagenet dataset (choose the loss scale carefully when using fp16 mode). The Y-axis in the figure shows images/s and the
+X-axis shows the number of GPUs.
+
+<p align="center">
+<img src="../images/imagenet_dist_performance.png" width=528> <br />
+Performance of Multiple-GPU Training of Resnet50 on Imagenet
+</p>
+
+The second figure shows the speed-up obtained with multiple GPUs, derived from the figure above.
+
+<p align="center">
+<img src="../images/imagenet_dist_speedup.png" width=528> <br />
+Speed-ups of Multiple-GPU Training of Resnet50 on Imagenet
+</p>
@@ -55,7 +55,7 @@ def eval():
         add_conv_body_func=resnet.add_ResNet50_conv4_body,
         add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
         use_pyreader=False,
-        is_train=False)
+        mode='val')
     model.build_model(image_shape)
     pred_boxes = model.eval_bbox_out()
     if cfg.MASK_ON:
......
@@ -35,7 +35,7 @@ def infer():
         add_conv_body_func=resnet.add_ResNet50_conv4_body,
         add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
         use_pyreader=False,
-        is_train=False)
+        mode='infer')
     model.build_model(image_shape)
     pred_boxes = model.eval_bbox_out()
     if cfg.MASK_ON:
......
@@ -25,12 +25,12 @@ class RCNN(object):
     def __init__(self,
                  add_conv_body_func=None,
                  add_roi_box_head_func=None,
-                 is_train=True,
+                 mode='train',
                  use_pyreader=True,
                  use_random=True):
         self.add_conv_body_func = add_conv_body_func
         self.add_roi_box_head_func = add_roi_box_head_func
-        self.is_train = is_train
+        self.mode = mode
         self.use_pyreader = use_pyreader
         self.use_random = use_random
@@ -41,7 +41,7 @@ class RCNN(object):
         self.rpn_heads(body_conv)
         # Fast RCNN
         self.fast_rcnn_heads(body_conv)
-        if not self.is_train:
+        if self.mode != 'train':
             self.eval_bbox()
         # Mask RCNN
         if cfg.MASK_ON:
@@ -115,7 +115,9 @@ class RCNN(object):
             name='gt_masks', shape=[2], dtype='float32', lod_level=3)

     def feeds(self):
-        if not self.is_train:
+        if self.mode == 'infer':
+            return [self.image, self.im_info]
+        if self.mode == 'val':
             return [self.image, self.im_info, self.im_id]
         if not cfg.MASK_ON:
             return [
@@ -213,7 +215,7 @@ class RCNN(object):
         rpn_cls_score_prob = fluid.layers.sigmoid(
             self.rpn_cls_score, name='rpn_cls_score_prob')
-        param_obj = cfg.TRAIN if self.is_train else cfg.TEST
+        param_obj = cfg.TRAIN if self.mode == 'train' else cfg.TEST
         pre_nms_top_n = param_obj.rpn_pre_nms_top_n
         post_nms_top_n = param_obj.rpn_post_nms_top_n
         nms_thresh = param_obj.rpn_nms_thresh
@@ -230,7 +232,7 @@ class RCNN(object):
             nms_thresh=nms_thresh,
             min_size=min_size,
             eta=eta)
-        if self.is_train:
+        if self.mode == 'train':
             outs = fluid.layers.generate_proposal_labels(
                 rpn_rois=self.rpn_rois,
                 gt_classes=self.gt_label,
@@ -267,7 +269,7 @@ class RCNN(object):
             self.mask_int32 = mask_out[2]

     def fast_rcnn_heads(self, roi_input):
-        if self.is_train:
+        if self.mode == 'train':
             pool_rois = self.rois
         else:
             pool_rois = self.rpn_rois
@@ -311,7 +313,7 @@ class RCNN(object):
             bias_attr=ParamAttr(
                 name='conv5_mask_b', learning_rate=2., regularizer=L2Decay(0.)))
         act_func = None
-        if not self.is_train:
+        if self.mode != 'train':
             act_func = 'sigmoid'
         mask_fcn_logits = fluid.layers.conv2d(
             input=mask_out,
@@ -325,13 +327,13 @@ class RCNN(object):
                 learning_rate=2.,
                 regularizer=L2Decay(0.)))
-        if not self.is_train:
+        if self.mode != 'train':
             mask_fcn_logits = fluid.layers.lod_reset(mask_fcn_logits,
                                                      self.pred_result)
         return mask_fcn_logits

     def mask_rcnn_heads(self, mask_input):
-        if self.is_train:
+        if self.mode == 'train':
             conv5 = fluid.layers.gather(self.res5_2_sum,
                                         self.roi_has_mask_int32)
             self.mask_fcn_logits = self.SuffixNet(conv5)
......
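The hunks above replace the boolean `is_train` switch with a three-valued `mode` string ('train', 'val', 'infer'), so the feed list can differ between validation (which needs `im_id` for COCO evaluation) and pure inference (which does not). Below is a minimal, self-contained sketch of that pattern; it is a hypothetical simplified class, not the real RCNN model, and its argument names are assumptions:

```python
class ModeAwareModel(object):
    """Hypothetical, simplified illustration of the is_train -> mode refactor."""

    def __init__(self, mode='train'):
        assert mode in ('train', 'val', 'infer'), "unsupported mode"
        self.mode = mode

    def feeds(self, image, im_info, im_id, gt_fields):
        # Inference only needs the image and its scale/shape info.
        if self.mode == 'infer':
            return [image, im_info]
        # Validation also needs the image id to match predictions to annotations.
        if self.mode == 'val':
            return [image, im_info, im_id]
        # Training additionally feeds the ground-truth fields.
        return [image, im_info, im_id] + list(gt_fields)

    def proposal_params(self, cfg):
        # RPN proposal settings still only distinguish train vs. test, as in the diff.
        return cfg.TRAIN if self.mode == 'train' else cfg.TEST
```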
@@ -36,7 +36,9 @@ def roidb_reader(roidb, mode):
     im_height = np.round(roidb['height'] * im_scales)
     im_width = np.round(roidb['width'] * im_scales)
     im_info = np.array([im_height, im_width, im_scales], dtype=np.float32)
-    if mode == 'val' or mode == 'infer':
+    if mode == 'infer':
+        return im, im_info
+    if mode == 'val':
         return im, im_info, im_id
     gt_boxes = roidb['gt_boxes'].astype('float32')
@@ -162,8 +164,8 @@ def coco(mode,
         for roidb in roidbs:
             if cfg.image_name not in roidb['image']:
                 continue
-            im, im_info, im_id = roidb_reader(roidb, mode)
-            batch_out = [(im, im_info, im_id)]
+            im, im_info = roidb_reader(roidb, mode)
+            batch_out = [(im, im_info)]
             yield batch_out

     return reader
......
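With this change the reader's output arity now mirrors `feeds()` above: 'infer' yields `(im, im_info)`, 'val' additionally yields the image id, and 'train' continues on to the ground-truth fields. A hedged sketch of that contract with dummy data follows; the dummy image, the fixed scale, and the `im_id` parameter are assumptions for illustration, not lines copied from the source:

```python
import numpy as np


def roidb_reader_sketch(roidb, mode, im_id=0):
    """Simplified stand-in for roidb_reader; the real code decodes and rescales the image."""
    im_scales = 1.0  # placeholder; the real reader derives this from the config
    im = np.zeros((3, int(roidb['height']), int(roidb['width'])), dtype='float32')
    im_height = np.round(roidb['height'] * im_scales)
    im_width = np.round(roidb['width'] * im_scales)
    im_info = np.array([im_height, im_width, im_scales], dtype=np.float32)
    if mode == 'infer':
        return im, im_info                 # inference: no image id
    if mode == 'val':
        return im, im_info, im_id          # validation: keep the id for COCO eval
    gt_boxes = roidb['gt_boxes'].astype('float32')
    return im, im_info, gt_boxes           # training continues with labels, masks, ...


# The infer branch of coco() then batches only the two fields, as in the second hunk:
# im, im_info = roidb_reader(roidb, mode); batch_out = [(im, im_info)]
```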
@@ -13,22 +13,26 @@ import net

 SEED = 102


 def parse_args():
     parser = argparse.ArgumentParser("gru4rec benchmark.")
     parser.add_argument(
-        '--train_dir', type=str, default='train_data', help='train file address')
-    parser.add_argument(
-        '--vocab_path', type=str, default='vocab.txt', help='vocab file address')
-    parser.add_argument(
-        '--is_local', type=int, default=1, help='whether local')
-    parser.add_argument(
-        '--hid_size', type=int, default=100, help='hid size')
+        '--train_dir',
+        type=str,
+        default='train_data',
+        help='train file address')
+    parser.add_argument(
+        '--vocab_path',
+        type=str,
+        default='vocab.txt',
+        help='vocab file address')
+    parser.add_argument('--is_local', type=int, default=1, help='whether local')
+    parser.add_argument('--hid_size', type=int, default=100, help='hid size')
     parser.add_argument(
         '--model_dir', type=str, default='model_recall20', help='model dir')
     parser.add_argument(
         '--batch_size', type=int, default=5, help='num of batch size')
-    parser.add_argument(
-        '--pass_num', type=int, default=10, help='num of epoch')
+    parser.add_argument('--pass_num', type=int, default=10, help='num of epoch')
     parser.add_argument(
         '--print_batch', type=int, default=10, help='num of print batch')
     parser.add_argument(
@@ -40,19 +44,33 @@ def parse_args():
     parser.add_argument(
         '--role', type=str, default='pserver', help='trainer or pserver')
     parser.add_argument(
-        '--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id', type=int, default=0, help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers', type=int, default=1, help='The num of trianers, (default: 1)')
+        '--endpoints',
+        type=str,
+        default='127.0.0.1:6000',
+        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
+    parser.add_argument(
+        '--current_endpoint',
+        type=str,
+        default='127.0.0.1:6000',
+        help='The current_endpoint')
+    parser.add_argument(
+        '--trainer_id',
+        type=int,
+        default=0,
+        help='trainer id ,only trainer_id=0 save model')
+    parser.add_argument(
+        '--trainers',
+        type=int,
+        default=1,
+        help='The num of trianers, (default: 1)')
     args = parser.parse_args()
     return args


 def get_cards(args):
     return args.num_devices


 def train():
     """ do training """
     args = parse_args()
@@ -67,7 +85,8 @@ def train():
         buffer_size=1000, word_freq_threshold=0, is_train=True)

     # Train program
-    src_wordseq, dst_wordseq, avg_cost, acc = net.network(vocab_size=vocab_size, hid_size=hid_size)
+    src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
+        vocab_size=vocab_size, hid_size=hid_size)

     # Optimization to minimize lost
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
@@ -97,8 +116,10 @@ def train():
                 lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
                                                      place)
                 ret_avg_cost = exe.run(main_program,
-                                       feed={ "src_wordseq": lod_src_wordseq,
-                                              "dst_wordseq": lod_dst_wordseq},
+                                       feed={
+                                           "src_wordseq": lod_src_wordseq,
+                                           "dst_wordseq": lod_dst_wordseq
+                                       },
                                        fetch_list=fetch_list)
                 avg_ppl = np.exp(ret_avg_cost[0])
                 newest_ppl = np.mean(avg_ppl)
@@ -113,7 +134,8 @@ def train():
             feed_var_names = ["src_wordseq", "dst_wordseq"]
             fetch_vars = [avg_cost, acc]
             if args.trainer_id == 0:
-                fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
+                fluid.io.save_inference_model(save_dir, feed_var_names,
+                                              fetch_vars, exe)
                 print("model saved in %s" % save_dir)
         print("finish training")
@@ -123,7 +145,8 @@ def train():
     else:
         print("run distribute training")
         t = fluid.DistributeTranspiler()
-        t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
+        t.transpile(
+            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
         if args.role == "pserver":
             print("run psever")
             pserver_prog = t.get_pserver_program(args.current_endpoint)
@@ -136,5 +159,6 @@ def train():
             print("run trainer")
             train_loop(t.get_trainer_program())


 if __name__ == "__main__":
     train()
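The distributed branch above follows the usual Fluid transpiler flow: transpile the single-node program once, then either run the parameter-server program or hand the trainer program to the existing `train_loop`, depending on `--role`. A condensed sketch of that dispatch is below; it assumes the same argument names as `parse_args()` above, and the `get_startup_program` call is the standard companion API rather than a line visible in this diff:

```python
import paddle.fluid as fluid


def run_distributed(args, train_loop):
    # Rewrite the single-node program into pserver/trainer programs.
    t = fluid.DistributeTranspiler()
    t.transpile(
        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
    if args.role == "pserver":
        # Parameter server: build its program and startup program, run on CPU.
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        pserver_startup = t.get_startup_program(args.current_endpoint,
                                                pserver_prog)
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    else:
        # Trainer: run the normal training loop on the transpiled program.
        train_loop(t.get_trainer_program())
```

A typical two-process local test would start one process with `--role pserver` and another with `--role trainer`, both pointing at the same `--endpoints`.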
@@ -11,6 +11,7 @@ import paddle
 import utils


 def parse_args():
     parser = argparse.ArgumentParser("gru4rec benchmark.")
     parser.add_argument(
@@ -22,12 +23,15 @@ def parse_args():
     parser.add_argument(
         '--model_dir', type=str, default='model_recall20', help='model dir')
     parser.add_argument(
-        '--use_cuda', type=int, default='1', help='whether use cuda')
+        '--use_cuda', type=int, default='0', help='whether use cuda')
     parser.add_argument(
         '--batch_size', type=int, default='5', help='batch_size')
+    parser.add_argument(
+        '--vocab_path', type=str, default='vocab.txt', help='vocab file')
     args = parser.parse_args()
     return args


 def infer(test_reader, use_cuda, model_path):
     """ inference function """
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
@@ -72,11 +76,16 @@ if __name__ == "__main__":
     test_dir = args.test_dir
     model_dir = args.model_dir
     batch_size = args.batch_size
+    vocab_path = args.vocab_path
     use_cuda = True if args.use_cuda else False
-    print("start index: ", start_index, " last_index:" ,last_index)
+    print("start index: ", start_index, " last_index:", last_index)
     vocab_size, test_reader = utils.prepare_data(
-        test_dir, "", batch_size=batch_size,
-        buffer_size=1000, word_freq_threshold=0, is_train=False)
+        test_dir,
+        vocab_path,
+        batch_size=batch_size,
+        buffer_size=1000,
+        word_freq_threshold=0,
+        is_train=False)
     for epoch in range(start_index, last_index + 1):
         epoch_path = model_dir + "/epoch_" + str(epoch)
......
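The `__main__` block now passes the new `--vocab_path` to `utils.prepare_data` and then evaluates every saved epoch directory (`model_dir + "/epoch_" + str(epoch)`). Below is a rough sketch of what the per-epoch `infer()` call typically does in this script; the loop body, fetch indexing, and scope handling are assumptions based on the surrounding code, not lines shown in this diff:

```python
import numpy as np
import paddle.fluid as fluid

import utils  # repo-local helper module, imported at the top of infer.py


def infer_sketch(test_reader, use_cuda, model_path):
    """Hypothetical per-epoch inference loop for the saved gru4rec models."""
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    with fluid.scope_guard(fluid.core.Scope()):
        # Load the program and feed/fetch handles written by
        # fluid.io.save_inference_model in train.py.
        infer_program, feed_names, fetch_vars = fluid.io.load_inference_model(
            model_path, exe)
        accs = []
        for data in test_reader():
            src_wordseq = utils.to_lodtensor([dat[0] for dat in data], place)
            dst_wordseq = utils.to_lodtensor([dat[1] for dat in data], place)
            outs = exe.run(infer_program,
                           feed={
                               feed_names[0]: src_wordseq,
                               feed_names[1]: dst_wordseq
                           },
                           fetch_list=fetch_vars)
            accs.append(np.mean(outs[-1]))  # last fetch assumed to be the recall@20 accuracy
        print("model: %s, recall@20: %.4f" % (model_path, np.mean(accs)))
```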