未验证 提交 eb32db18 编写于 作者: Z zhengya01 提交者: GitHub

Merge pull request #14 from PaddlePaddle/develop

update
......@@ -39,6 +39,8 @@ You can test if distributed training works on a single node before deploying to
***NOTE: for best performance, we recommend using multi-process mode, see No.3. And together with fp16.***
***NOTE: for nccl2 distributed mode, you must ensure each node trains the same number of samples, or set skip_unbalanced_data to 1 to do sync training.***
1. simply run `python dist_train.py` to start local training with default configurations.
2. for pserver mode, run `bash run_ps_mode.sh` to start 2 pservers and 2 trainers, these 2 trainers
will use GPU 0 and 1 to simulate 2 workers.
......@@ -90,4 +92,19 @@ The default resnet50 distributed training config is based on this paper: https:/
### Performance
TBD
The figure below shows Fluid distributed training performance. We measured this on a 4-node V100 GPU cluster,
each node with 8 V100 GPU cards, for a total of 32 GPUs. All modes can reach the "state of the art (choose loss scale carefully when using fp16 mode)" of ResNet50 model with imagenet dataset. The Y axis in the figure shows
the images/s while the X-axis shows the number of GPUs.
<p align="center">
<img src="../images/imagenet_dist_performance.png" width=528> <br />
Performance of Multiple-GPU Training of Resnet50 on Imagenet
</p>
The second figure shows speed-ups when using multiple GPUs according to the above figure.
<p align="center">
<img src="../images/imagenet_dist_speedup.png" width=528> <br />
Speed-ups of Multiple-GPU Training of Resnet50 on Imagenet
</p>
......@@ -55,7 +55,7 @@ def eval():
add_conv_body_func=resnet.add_ResNet50_conv4_body,
add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
use_pyreader=False,
is_train=False)
mode='val')
model.build_model(image_shape)
pred_boxes = model.eval_bbox_out()
if cfg.MASK_ON:
......
......@@ -35,7 +35,7 @@ def infer():
add_conv_body_func=resnet.add_ResNet50_conv4_body,
add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
use_pyreader=False,
is_train=False)
mode='infer')
model.build_model(image_shape)
pred_boxes = model.eval_bbox_out()
if cfg.MASK_ON:
......
......@@ -25,12 +25,12 @@ class RCNN(object):
def __init__(self,
add_conv_body_func=None,
add_roi_box_head_func=None,
is_train=True,
mode='train',
use_pyreader=True,
use_random=True):
self.add_conv_body_func = add_conv_body_func
self.add_roi_box_head_func = add_roi_box_head_func
self.is_train = is_train
self.mode = mode
self.use_pyreader = use_pyreader
self.use_random = use_random
......@@ -41,7 +41,7 @@ class RCNN(object):
self.rpn_heads(body_conv)
# Fast RCNN
self.fast_rcnn_heads(body_conv)
if not self.is_train:
if self.mode != 'train':
self.eval_bbox()
# Mask RCNN
if cfg.MASK_ON:
......@@ -115,7 +115,9 @@ class RCNN(object):
name='gt_masks', shape=[2], dtype='float32', lod_level=3)
def feeds(self):
if not self.is_train:
if self.mode == 'infer':
return [self.image, self.im_info]
if self.mode == 'val':
return [self.image, self.im_info, self.im_id]
if not cfg.MASK_ON:
return [
......@@ -213,7 +215,7 @@ class RCNN(object):
rpn_cls_score_prob = fluid.layers.sigmoid(
self.rpn_cls_score, name='rpn_cls_score_prob')
param_obj = cfg.TRAIN if self.is_train else cfg.TEST
param_obj = cfg.TRAIN if self.mode == 'train' else cfg.TEST
pre_nms_top_n = param_obj.rpn_pre_nms_top_n
post_nms_top_n = param_obj.rpn_post_nms_top_n
nms_thresh = param_obj.rpn_nms_thresh
......@@ -230,7 +232,7 @@ class RCNN(object):
nms_thresh=nms_thresh,
min_size=min_size,
eta=eta)
if self.is_train:
if self.mode == 'train':
outs = fluid.layers.generate_proposal_labels(
rpn_rois=self.rpn_rois,
gt_classes=self.gt_label,
......@@ -267,7 +269,7 @@ class RCNN(object):
self.mask_int32 = mask_out[2]
def fast_rcnn_heads(self, roi_input):
if self.is_train:
if self.mode == 'train':
pool_rois = self.rois
else:
pool_rois = self.rpn_rois
......@@ -311,7 +313,7 @@ class RCNN(object):
bias_attr=ParamAttr(
name='conv5_mask_b', learning_rate=2., regularizer=L2Decay(0.)))
act_func = None
if not self.is_train:
if self.mode != 'train':
act_func = 'sigmoid'
mask_fcn_logits = fluid.layers.conv2d(
input=mask_out,
......@@ -325,13 +327,13 @@ class RCNN(object):
learning_rate=2.,
regularizer=L2Decay(0.)))
if not self.is_train:
if self.mode != 'train':
mask_fcn_logits = fluid.layers.lod_reset(mask_fcn_logits,
self.pred_result)
return mask_fcn_logits
def mask_rcnn_heads(self, mask_input):
if self.is_train:
if self.mode == 'train':
conv5 = fluid.layers.gather(self.res5_2_sum,
self.roi_has_mask_int32)
self.mask_fcn_logits = self.SuffixNet(conv5)
......
......@@ -36,7 +36,9 @@ def roidb_reader(roidb, mode):
im_height = np.round(roidb['height'] * im_scales)
im_width = np.round(roidb['width'] * im_scales)
im_info = np.array([im_height, im_width, im_scales], dtype=np.float32)
if mode == 'val' or mode == 'infer':
if mode == 'infer':
return im, im_info
if mode == 'val':
return im, im_info, im_id
gt_boxes = roidb['gt_boxes'].astype('float32')
......@@ -162,8 +164,8 @@ def coco(mode,
for roidb in roidbs:
if cfg.image_name not in roidb['image']:
continue
im, im_info, im_id = roidb_reader(roidb, mode)
batch_out = [(im, im_info, im_id)]
im, im_info = roidb_reader(roidb, mode)
batch_out = [(im, im_info)]
yield batch_out
return reader
......
......@@ -13,22 +13,26 @@ import net
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--train_dir', type=str, default='train_data', help='train file address')
parser.add_argument(
'--vocab_path', type=str, default='vocab.txt', help='vocab file address')
parser.add_argument(
'--is_local', type=int, default=1, help='whether local')
'--train_dir',
type=str,
default='train_data',
help='train file address')
parser.add_argument(
'--hid_size', type=int, default=100, help='hid size')
'--vocab_path',
type=str,
default='vocab.txt',
help='vocab file address')
parser.add_argument('--is_local', type=int, default=1, help='whether local')
parser.add_argument('--hid_size', type=int, default=100, help='hid size')
parser.add_argument(
'--model_dir', type=str, default='model_recall20', help='model dir')
parser.add_argument(
'--batch_size', type=int, default=5, help='num of batch size')
parser.add_argument(
'--pass_num', type=int, default=10, help='num of epoch')
parser.add_argument('--pass_num', type=int, default=10, help='num of epoch')
parser.add_argument(
'--print_batch', type=int, default=10, help='num of print batch')
parser.add_argument(
......@@ -40,19 +44,33 @@ def parse_args():
parser.add_argument(
'--role', type=str, default='pserver', help='trainer or pserver')
parser.add_argument(
'--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint')
parser.add_argument(
'--trainer_id', type=int, default=0, help='trainer id ,only trainer_id=0 save model')
parser.add_argument(
'--trainers', type=int, default=1, help='The num of trianers, (default: 1)')
'--endpoints',
type=str,
default='127.0.0.1:6000',
help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
parser.add_argument(
'--current_endpoint',
type=str,
default='127.0.0.1:6000',
help='The current_endpoint')
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id ,only trainer_id=0 save model')
parser.add_argument(
'--trainers',
type=int,
default=1,
help='The num of trianers, (default: 1)')
args = parser.parse_args()
return args
def get_cards(args):
return args.num_devices
def train():
""" do training """
args = parse_args()
......@@ -67,12 +85,13 @@ def train():
buffer_size=1000, word_freq_threshold=0, is_train=True)
# Train program
src_wordseq, dst_wordseq, avg_cost, acc = net.network(vocab_size=vocab_size, hid_size=hid_size)
src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
vocab_size=vocab_size, hid_size=hid_size)
# Optimization to minimize lost
sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
sgd_optimizer.minimize(avg_cost)
def train_loop(main_program):
""" train network """
pass_num = args.pass_num
......@@ -97,9 +116,11 @@ def train():
lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
place)
ret_avg_cost = exe.run(main_program,
feed={ "src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq},
fetch_list=fetch_list)
feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % args.print_batch == 0:
......@@ -113,7 +134,8 @@ def train():
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost, acc]
if args.trainer_id == 0:
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
fluid.io.save_inference_model(save_dir, feed_var_names,
fetch_vars, exe)
print("model saved in %s" % save_dir)
print("finish training")
......@@ -123,7 +145,8 @@ def train():
else:
print("run distribute training")
t = fluid.DistributeTranspiler()
t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
t.transpile(
args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
if args.role == "pserver":
print("run psever")
pserver_prog = t.get_pserver_program(args.current_endpoint)
......@@ -136,5 +159,6 @@ def train():
print("run trainer")
train_loop(t.get_trainer_program())
if __name__ == "__main__":
train()
......@@ -11,23 +11,27 @@ import paddle
import utils
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
parser.add_argument(
'--test_dir', type=str, default='test_data', help='test file address')
parser.add_argument(
'--start_index', type=int, default='1', help='start index')
'--start_index', type=int, default='1', help='start index')
parser.add_argument(
'--last_index', type=int, default='10', help='end index')
parser.add_argument(
'--last_index', type=int, default='10', help='end index')
'--model_dir', type=str, default='model_recall20', help='model dir')
parser.add_argument(
'--model_dir', type=str, default='model_recall20', help='model dir')
'--use_cuda', type=int, default='0', help='whether use cuda')
parser.add_argument(
'--use_cuda', type=int, default='1', help='whether use cuda')
'--batch_size', type=int, default='5', help='batch_size')
parser.add_argument(
'--batch_size', type=int, default='5', help='batch_size')
'--vocab_path', type=str, default='vocab.txt', help='vocab file')
args = parser.parse_args()
return args
def infer(test_reader, use_cuda, model_path):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......@@ -72,11 +76,16 @@ if __name__ == "__main__":
test_dir = args.test_dir
model_dir = args.model_dir
batch_size = args.batch_size
vocab_path = args.vocab_path
use_cuda = True if args.use_cuda else False
print("start index: ", start_index, " last_index:" ,last_index)
print("start index: ", start_index, " last_index:", last_index)
vocab_size, test_reader = utils.prepare_data(
test_dir, "", batch_size=batch_size,
buffer_size=1000, word_freq_threshold=0, is_train=False)
test_dir,
vocab_path,
batch_size=batch_size,
buffer_size=1000,
word_freq_threshold=0,
is_train=False)
for epoch in range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册