提交 f59f9d9d 编写于 作者: Y Yancey1989

cleanup code

上级 fc36a9a0
...@@ -17,7 +17,7 @@ TRAINER_ID = int(os.getenv("PADDLE_TRAINER_ID", "0")) ...@@ -17,7 +17,7 @@ TRAINER_ID = int(os.getenv("PADDLE_TRAINER_ID", "0"))
FINISH_EVENT = "FINISH_EVENT" FINISH_EVENT = "FINISH_EVENT"
class PaddleDataLoader(object): class PaddleDataLoader(object):
def __init__(self, torch_dataset, indices=None, concurrent=16, queue_size=1024, shuffle=True, batch_size=224, is_distributed=True): def __init__(self, torch_dataset, indices=None, concurrent=16, queue_size=3072, shuffle=True, batch_size=224, is_distributed=True):
self.torch_dataset = torch_dataset self.torch_dataset = torch_dataset
self.data_queue = multiprocessing.Queue(queue_size) self.data_queue = multiprocessing.Queue(queue_size)
self.indices = indices self.indices = indices
...@@ -54,7 +54,7 @@ class PaddleDataLoader(object): ...@@ -54,7 +54,7 @@ class PaddleDataLoader(object):
offset = TRAINER_ID * cnt_per_node offset = TRAINER_ID * cnt_per_node
worker_indices = self.indices[offset: (offset + cnt_per_node)] worker_indices = self.indices[offset: (offset + cnt_per_node)]
if len(worker_indices) % self.batch_size != 0: if len(worker_indices) % self.batch_size != 0:
worker_indices += worker_indices[:(self.batch_size - (len(worker_indices) % self.batch_size))] worker_indices += worker_indices[-(self.batch_size - (len(worker_indices) % self.batch_size)):]
print("shuffle: [%d], shuffle seed: [%d], worker indices len: [%d], %s" % (self.shuffle, self.shuffle_seed, len(worker_indices), worker_indices[:10])) print("shuffle: [%d], shuffle seed: [%d], worker indices len: [%d], %s" % (self.shuffle, self.shuffle_seed, len(worker_indices), worker_indices[:10]))
cnt_per_thread = int(math.ceil(len(worker_indices) / self.concurrent)) cnt_per_thread = int(math.ceil(len(worker_indices) / self.concurrent))
......
...@@ -111,7 +111,7 @@ def linear_lr_decay(lr_values, epochs, bs_values, total_images): ...@@ -111,7 +111,7 @@ def linear_lr_decay(lr_values, epochs, bs_values, total_images):
linear_epoch = end_epoch - start_epoch linear_epoch = end_epoch - start_epoch
start_lr, end_lr = lr_values[idx] start_lr, end_lr = lr_values[idx]
linear_lr = end_lr - start_lr linear_lr = end_lr - start_lr
steps = last_steps + linear_epoch * total_images / bs_values[idx] steps = last_steps + linear_epoch * total_images / bs_values[idx] + 1
with switch.case(global_step < steps): with switch.case(global_step < steps):
decayed_lr = start_lr + linear_lr * ((global_step - last_steps)* 1.0/(steps - last_steps)) decayed_lr = start_lr + linear_lr * ((global_step - last_steps)* 1.0/(steps - last_steps))
last_steps = steps last_steps = steps
...@@ -167,6 +167,7 @@ def linear_lr_decay_by_epoch(lr_values, epochs, bs_values, total_images): ...@@ -167,6 +167,7 @@ def linear_lr_decay_by_epoch(lr_values, epochs, bs_values, total_images):
fluid.layers.tensor.assign(last_value_var, lr) fluid.layers.tensor.assign(last_value_var, lr)
return lr return lr
def test_parallel(exe, test_args, args, test_prog, feeder, bs): def test_parallel(exe, test_args, args, test_prog, feeder, bs):
acc_evaluators = [] acc_evaluators = []
for i in xrange(len(test_args[2])): for i in xrange(len(test_args[2])):
...@@ -234,12 +235,9 @@ def build_program(args, is_train, main_prog, startup_prog, py_reader_startup_pro ...@@ -234,12 +235,9 @@ def build_program(args, is_train, main_prog, startup_prog, py_reader_startup_pro
name="train_reader_" + str(img_size) if is_train else "test_reader_" + str(img_size), name="train_reader_" + str(img_size) if is_train else "test_reader_" + str(img_size),
use_double_buffer=True) use_double_buffer=True)
input, label = fluid.layers.read_file(pyreader) input, label = fluid.layers.read_file(pyreader)
#pyreader.decorate_paddle_reader(paddle.batch(imagenet_reader.train(os.path.join(args.data_dir, trn_dir, "train")), batch_size=batch_size))
#pyreader.decorate_paddle_reader(paddle.batch(dataloader.reader(), batch_size=batch_size))
else: else:
input = fluid.layers.data(name="image", shape=[3, 244, 244], dtype="uint8") input = fluid.layers.data(name="image", shape=[3, 244, 244], dtype="uint8")
label = fluid.layers.data(name="label", shape=[1], dtype="int64") label = fluid.layers.data(name="label", shape=[1], dtype="int64")
#batched_reader = paddle.batch(dataloader.reader(), batch_size=batch_size)
cast_img_type = "float16" if args.fp16 else "float32" cast_img_type = "float16" if args.fp16 else "float32"
cast = fluid.layers.cast(input, cast_img_type) cast = fluid.layers.cast(input, cast_img_type)
img_mean = fluid.layers.create_global_var([3, 1, 1], 0.0, cast_img_type, name="img_mean", persistable=True) img_mean = fluid.layers.create_global_var([3, 1, 1], 0.0, cast_img_type, name="img_mean", persistable=True)
...@@ -265,6 +263,7 @@ def build_program(args, is_train, main_prog, startup_prog, py_reader_startup_pro ...@@ -265,6 +263,7 @@ def build_program(args, is_train, main_prog, startup_prog, py_reader_startup_pro
bs_epoch = [x if is_mp_mode() else x * get_device_num() for x in [224, 224, 96, 96, 50]] bs_epoch = [x if is_mp_mode() else x * get_device_num() for x in [224, 224, 96, 96, 50]]
lrs = [(1.0, 2.0), (2.0, 0.25), (0.42857142857142855, 0.04285714285714286), (0.04285714285714286, 0.004285714285714286), (0.0022321428571428575, 0.00022321428571428573), 0.00022321428571428573] lrs = [(1.0, 2.0), (2.0, 0.25), (0.42857142857142855, 0.04285714285714286), (0.04285714285714286, 0.004285714285714286), (0.0022321428571428575, 0.00022321428571428573), 0.00022321428571428573]
images_per_worker = args.total_images / get_device_num() if is_mp_mode() else args.total_images images_per_worker = args.total_images / get_device_num() if is_mp_mode() else args.total_images
optimizer = fluid.optimizer.Momentum( optimizer = fluid.optimizer.Momentum(
learning_rate=linear_lr_decay_by_epoch(lrs, epochs, bs_epoch, images_per_worker), learning_rate=linear_lr_decay_by_epoch(lrs, epochs, bs_epoch, images_per_worker),
momentum=0.9, momentum=0.9,
...@@ -330,11 +329,10 @@ def refresh_program(args, epoch, sz, trn_dir, bs, val_bs, need_update_start_prog ...@@ -330,11 +329,10 @@ def refresh_program(args, epoch, sz, trn_dir, bs, val_bs, need_update_start_prog
else: else:
var.get_tensor().set(np_tensor, place) var.get_tensor().set(np_tensor, place)
strategy = fluid.ExecutionStrategy() strategy = fluid.ExecutionStrategy()
strategy.num_threads = args.num_threads strategy.num_threads = args.num_threads
strategy.allow_op_delay = False strategy.allow_op_delay = False
strategy.num_iteration_per_drop_scope = 30 strategy.num_iteration_per_drop_scope = 1
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy().ReduceStrategy.AllReduce build_strategy.reduce_strategy = fluid.BuildStrategy().ReduceStrategy.AllReduce
...@@ -389,6 +387,7 @@ def train_parallel(args): ...@@ -389,6 +387,7 @@ def train_parallel(args):
train_dataloader.shuffle_seed = pass_id + 1 train_dataloader.shuffle_seed = pass_id + 1
train_args[4].start() # start pyreader train_args[4].start() # start pyreader
batch_time_start = time.time() batch_time_start = time.time()
samples_per_step = bs if is_mp_mode() else bs * get_device_num()
while True: while True:
fetch_list = [avg_loss.name] fetch_list = [avg_loss.name]
acc_name_list = [v.name for v in train_args[2]] acc_name_list = [v.name for v in train_args[2]]
...@@ -412,13 +411,12 @@ def train_parallel(args): ...@@ -412,13 +411,12 @@ def train_parallel(args):
except fluid.core.EnforceNotMet as ex: except fluid.core.EnforceNotMet as ex:
traceback.print_exc() traceback.print_exc()
exit(1) exit(1)
num_samples += samples_per_step
num_samples += bs if is_mp_mode() else bs * get_device_num()
if should_print: if should_print:
fetched_data = [np.mean(np.array(d)) for d in fetch_ret] fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
print("Pass %d, batch %d, loss %s, accucacys: %s, learning_rate %s, py_reader queue_size: %d, avg batch time: %0.2f " % print("Pass %d, batch %d, loss %s, accucacys: %s, learning_rate %s, py_reader queue_size: %d, avg batch time: %0.4f secs" %
(pass_id, iters, fetched_data[0], fetched_data[1:-1], fetched_data[-1], train_args[4].queue.size(), (time.time() - batch_time_start) * 1.0 / bs )) (pass_id, iters, fetched_data[0], fetched_data[1:-1], fetched_data[-1], train_args[4].queue.size(), (time.time() - batch_time_start) * 1.0 / args.log_period ))
batch_time_start = time.time() batch_time_start = time.time()
iters += 1 iters += 1
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册