diff --git a/.gitignore b/.gitignore
index 9376aa940a6060e88d9b2415909292a95a15ca7a..61a80a88edb71e9ba4192f84ab7821ba139bb9ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
 *.user
 *.pyc
 *~
+*.vscode
diff --git a/fluid/PaddleCV/image_classification/fast_resnet/requirements.txt b/fluid/PaddleCV/image_classification/fast_resnet/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5e13381c497be8ec79924595b4de747b56c3f444
--- /dev/null
+++ b/fluid/PaddleCV/image_classification/fast_resnet/requirements.txt
@@ -0,0 +1,3 @@
+torch==0.4.1
+torchvision
+tqdm
diff --git a/fluid/PaddleCV/image_classification/fast_resnet/torchvision_reader.py b/fluid/PaddleCV/image_classification/fast_resnet/torchvision_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5a7123b1e1e0343c4a9f06a926f82a9d855c1f6
--- /dev/null
+++ b/fluid/PaddleCV/image_classification/fast_resnet/torchvision_reader.py
@@ -0,0 +1,233 @@
+import os
+
+import numpy as np
+import math
+import random
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+from torch.utils.data.sampler import Sampler
+import torchvision
+import pickle
+from tqdm import tqdm
+import time
+import multiprocessing
+
+TRAINER_NUMS = int(os.getenv("PADDLE_TRAINER_NUM", "1"))
+TRAINER_ID = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+
+FINISH_EVENT = "FINISH_EVENT"
+
+
+class PaddleDataLoader(object):
+    """Wraps a torchvision dataset and exposes it as a Paddle reader.
+
+    Samples are read by `concurrent` worker processes and funneled through a
+    multiprocessing queue; each worker pushes FINISH_EVENT when it is done.
+    """
+
+    def __init__(self, torch_dataset, indices=None, concurrent=16, queue_size=3072):
+        self.torch_dataset = torch_dataset
+        self.data_queue = multiprocessing.Queue(queue_size)
+        self.indices = indices
+        self.concurrent = concurrent
+
+    def _worker_loop(self, dataset, worker_indices, worker_id):
+        cnt = 0
+        for idx in worker_indices:
+            cnt += 1
+            img, label = self.torch_dataset[idx]
+            img = np.array(img).astype('uint8').transpose((2, 0, 1))
+            self.data_queue.put((img, label))
+        print("worker: [%d] read [%d] samples. " % (worker_id, cnt))
+        self.data_queue.put(FINISH_EVENT)
+
+    def reader(self):
+        def _reader_creator():
+            worker_processes = []
+            total_img = len(self.torch_dataset)
+            print("total image: %d" % total_img)
+            if self.indices is None:
+                self.indices = [i for i in xrange(total_img)]
+                random.seed(time.time())
+                random.shuffle(self.indices)
+                print("shuffle indices: %s ..." % self.indices[:10])
+
+            imgs_per_worker = int(math.ceil(total_img * 1.0 / self.concurrent))
+            for i in xrange(self.concurrent):
+                start = i * imgs_per_worker
+                end = (i + 1) * imgs_per_worker if i != self.concurrent - 1 else None
+                sliced_indices = self.indices[start:end]
+                w = multiprocessing.Process(
+                    target=self._worker_loop,
+                    args=(self.torch_dataset, sliced_indices, i)
+                )
+                w.daemon = True
+                w.start()
+                worker_processes.append(w)
+            finish_workers = 0
+            worker_cnt = len(worker_processes)
+            while finish_workers < worker_cnt:
+                sample = self.data_queue.get()
+                if sample == FINISH_EVENT:
+                    finish_workers += 1
+                else:
+                    yield sample
+
+        return _reader_creator
+
+
+def train(traindir, sz, min_scale=0.08):
+    train_tfms = [
+        transforms.RandomResizedCrop(sz, scale=(min_scale, 1.0)),
+        transforms.RandomHorizontalFlip()
+    ]
+    train_dataset = datasets.ImageFolder(traindir, transforms.Compose(train_tfms))
+    return PaddleDataLoader(train_dataset).reader()
+
+
+def test(valdir, bs, sz, rect_val=False):
+    if rect_val:
+        idx_ar_sorted = sort_ar(valdir)
+        idx_sorted, _ = zip(*idx_ar_sorted)
+        idx2ar = map_idx2ar(idx_ar_sorted, bs)
+
+        ar_tfms = [transforms.Resize(int(sz * 1.14)), CropArTfm(idx2ar, sz)]
+        val_dataset = ValDataset(valdir, transform=ar_tfms)
+        return PaddleDataLoader(val_dataset, concurrent=1, indices=idx_sorted).reader()
+
+    val_tfms = [transforms.Resize(int(sz * 1.14)), transforms.CenterCrop(sz)]
+    val_dataset = datasets.ImageFolder(valdir, transforms.Compose(val_tfms))
+
+    return PaddleDataLoader(val_dataset).reader()
+
+
+def create_validation_set(valdir, batch_size, target_size, rect_val, distributed):
+    print("create_validation_set: %s %s %s %s %s" % (valdir, batch_size, target_size, rect_val, distributed))
+    if rect_val:
+        idx_ar_sorted = sort_ar(valdir)
+        idx_sorted, _ = zip(*idx_ar_sorted)
+        idx2ar = map_idx2ar(idx_ar_sorted, batch_size)
+
+        ar_tfms = [transforms.Resize(int(target_size * 1.14)), CropArTfm(idx2ar, target_size)]
+        val_dataset = ValDataset(valdir, transform=ar_tfms)
+        val_sampler = DistValSampler(idx_sorted, batch_size=batch_size, distributed=distributed)
+        return val_dataset, val_sampler
+
+    val_tfms = [transforms.Resize(int(target_size * 1.14)), transforms.CenterCrop(target_size)]
+    val_dataset = datasets.ImageFolder(valdir, transforms.Compose(val_tfms))
+    val_sampler = DistValSampler(list(range(len(val_dataset))), batch_size=batch_size, distributed=distributed)
+    return val_dataset, val_sampler
+
+
+class ValDataset(datasets.ImageFolder):
+    def __init__(self, root, transform=None, target_transform=None):
+        super(ValDataset, self).__init__(root, transform, target_transform)
+
+    def __getitem__(self, index):
+        path, target = self.imgs[index]
+        sample = self.loader(path)
+        if self.transform is not None:
+            for tfm in self.transform:
+                # CropArTfm needs the sample index to look up its aspect ratio.
+                if isinstance(tfm, CropArTfm):
+                    sample = tfm(sample, index)
+                else:
+                    sample = tfm(sample)
+        if self.target_transform is not None:
+            target = self.target_transform(target)
+
+        return sample, target
+
+
+class DistValSampler(Sampler):
+    # DistValSampler distributes batches equally (based on batch size) to every
+    # GPU, even if there aren't enough images to go around.
+    # WARNING: some batches will contain an empty array to signal that there
+    # are not enough images left.
+    # distributed=False: the same validation run happens on every single GPU.
+    def __init__(self, indices, batch_size, distributed=True):
+        self.indices = indices
+        self.batch_size = batch_size
+        if distributed:
+            self.world_size = TRAINER_NUMS
+            self.global_rank = TRAINER_ID
+        else:
+            self.global_rank = 0
+            self.world_size = 1
+
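+        # Worked example of the math below: 50000 images, world_size 8,
+        # batch_size 64 -> expected_num_batches = ceil(50000 / 8 / 64) = 98,
+        # so each trainer is assigned num_samples = 98 * 64 = 6272 indices.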
+        # Expected number of batches per trainer. Needed so that every
+        # distributed GPU validates on the same number of batches, even if
+        # there isn't enough data to go around.
+        self.expected_num_batches = int(
+            math.ceil(len(self.indices) * 1.0 / self.world_size / self.batch_size))
+
+        # num_samples = total images / world_size; this is what we distribute
+        # to each GPU.
+        self.num_samples = self.expected_num_batches * self.batch_size
+
+    def __iter__(self):
+        offset = self.num_samples * self.global_rank
+        sampled_indices = self.indices[offset:offset + self.num_samples]
+        print("DistValSampler: world_size [%d], global_rank [%d]" % (self.world_size, self.global_rank))
+        for i in range(self.expected_num_batches):
+            offset = i * self.batch_size
+            yield sampled_indices[offset:offset + self.batch_size]
+
+    def __len__(self):
+        return self.expected_num_batches
+
+    def set_epoch(self, epoch):
+        return
+
+
+class CropArTfm(object):
+    def __init__(self, idx2ar, target_size):
+        self.idx2ar, self.target_size = idx2ar, target_size
+
+    def __call__(self, img, idx):
+        target_ar = self.idx2ar[idx]
+        if target_ar < 1:
+            w = int(self.target_size / target_ar)
+            size = (w // 8 * 8, self.target_size)
+        else:
+            h = int(self.target_size * target_ar)
+            size = (self.target_size, h // 8 * 8)
+        return torchvision.transforms.functional.center_crop(img, size)
+
+
+def sort_ar(valdir):
+    idx2ar_file = valdir + '/../sorted_idxar.p'
+    if os.path.isfile(idx2ar_file):
+        return pickle.load(open(idx2ar_file, 'rb'))
+    print('Creating AR indexes. Please be patient, this may take a couple of minutes...')
+    val_dataset = datasets.ImageFolder(valdir)  # AS: TODO: use Image.open instead of looping through dataset
+    sizes = [img[0].size for img in tqdm(val_dataset, total=len(val_dataset))]
+    idx_ar = [(i, round(s[0] * 1.0 / s[1], 5)) for i, s in enumerate(sizes)]
+    sorted_idxar = sorted(idx_ar, key=lambda x: x[1])
+    pickle.dump(sorted_idxar, open(idx2ar_file, 'wb'))
+    print('Done')
+    return sorted_idxar
+
+
+def chunks(l, n):
+    n = max(1, n)
+    return (l[i:i + n] for i in range(0, len(l), n))
+
+
+def map_idx2ar(idx_ar_sorted, batch_size):
+    ar_chunks = list(chunks(idx_ar_sorted, batch_size))
+    idx2ar = {}
+    for chunk in ar_chunks:
+        idxs, ars = list(zip(*chunk))
+        mean = round(np.mean(ars), 5)
+        for idx in idxs:
+            idx2ar[idx] = mean
+    return idx2ar
+
+
+if __name__ == "__main__":
+    import time
+    test_reader = test(valdir="/data/imagenet/validation", bs=50, sz=288, rect_val=True)
+    start_ts = time.time()
+    for idx, data in enumerate(test_reader()):
+        print("%d %s %s" % (idx, data[0].shape, data[1]))
+        if idx == 10:
+            break
+        if (idx + 1) % 1000 == 0:
+            cost = time.time() - start_ts
+            print("%d samples per second" % (1000 / cost))
+            start_ts = time.time()
\ No newline at end of file
diff --git a/fluid/PaddleCV/image_classification/fast_resnet/train.py b/fluid/PaddleCV/image_classification/fast_resnet/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..2424bc045b56e38d4e9e44c11402756955cab8e4
--- /dev/null
+++ b/fluid/PaddleCV/image_classification/fast_resnet/train.py
@@ -0,0 +1,380 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+import traceback
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+import torchvision_reader
+import sys
+sys.path.append("..")
+from utility import add_arguments, print_arguments
+import functools
+import models
+import utils
+
+# bool("0") is True, so compare against the raw string instead.
+DEBUG_PROG = os.getenv("DEBUG_PROG", "0") != "0"
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    add_arg = functools.partial(add_arguments, argparser=parser)
+    # yapf: disable
+    add_arg('batch_size', int, 256, "Minibatch size.")
+    add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
+    add_arg('total_images', int, 1281167, "Training image number.")
+    add_arg('num_epochs', int, 120, "Number of epochs.")
+    add_arg('class_dim', int, 1000, "Class number.")
+    add_arg('image_shape', str, "3,224,224", "Input image size.")
+    add_arg('model_save_dir', str, "output", "Model save directory.")
+    add_arg('with_mem_opt', bool, False, "Whether to use memory optimization or not.")
+    add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
+    add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
+    add_arg('lr', float, 0.1, "Set learning rate.")
+    add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
+    add_arg('model', str, "FastResNet", "Set the network to use.")
+    add_arg('data_dir', str, "./data/ILSVRC2012", "The ImageNet dataset root dir.")
+    add_arg('model_category', str, "models", "Whether to use models_name or not, valid values: 'models', 'models_name'.")
+    add_arg('fp16', bool, False, "Enable half precision training with fp16.")
+    add_arg('scale_loss', float, 1.0, "Scale loss for fp16.")
+    # for distributed
+    add_arg('start_test_pass', int, 0, "Start test after x passes.")
+    add_arg('num_threads', int, 8, "Use num_threads to run the fluid program.")
+    add_arg('reduce_strategy', str, "allreduce", "Choose from reduce or allreduce.")
+    add_arg('log_period', int, 5, "Print period, default is 5.")
+    add_arg('init_conv2d_kaiming', bool, False, "Whether to initialize conv2d weights with Kaiming init.")
+    add_arg('memory_optimize', bool, True, "Whether to enable memory optimize.")
+    # yapf: enable
+    args = parser.parse_args()
+    return args
+
+
+def get_device_num():
+    import subprocess
+    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
+    if visible_device:
+        device_num = len(visible_device.split(','))
+    else:
+        device_num = subprocess.check_output(
+            ['nvidia-smi', '-L']).decode().count('\n')
+    return device_num
+
+
+def linear_lr_decay(lr_values, epochs, bs_values, total_images):
+    """Piecewise-linear learning-rate decay.
+
+    Within each (start_epoch, end_epoch) phase in `epochs`, the lr is
+    interpolated linearly between that phase's (start_lr, end_lr) pair in
+    `lr_values`; past the last phase it holds at the scalar lr_values[-1].
+    """
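+    # Worked example: with epochs=[(0, 7)], lr_values=[(1.0, 2.0), 2.0],
+    # bs_values=[1792] and total_images=1281167, the phase spans
+    # ceil(1281167 / 1792) * 7 = 5005 steps, over which lr ramps linearly
+    # from 1.0 to 2.0, then holds at 2.0.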
+    from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
+    import paddle.fluid.layers.tensor as tensor
+    import math
+
+    with paddle.fluid.default_main_program()._lr_schedule_guard():
+        global_step = _decay_step_counter()
+
+        lr = tensor.create_global_var(
+            shape=[1],
+            value=0.0,
+            dtype='float32',
+            persistable=True,
+            name="learning_rate")
+        with fluid.layers.control_flow.Switch() as switch:
+            last_steps = 0
+            for idx, epoch_bound in enumerate(epochs):
+                start_epoch, end_epoch = epoch_bound
+                linear_epoch = end_epoch - start_epoch
+                start_lr, end_lr = lr_values[idx]
+                linear_lr = end_lr - start_lr
+                phase_steps = math.ceil(total_images * 1.0 / bs_values[idx]) * linear_epoch
+                steps = last_steps + phase_steps
+                with switch.case(global_step < steps):
+                    # Interpolate within this phase only, so the fraction
+                    # reaches 1.0 by the end of the phase.
+                    decayed_lr = start_lr + linear_lr * (
+                        (global_step - last_steps) * 1.0 / phase_steps)
+                    fluid.layers.tensor.assign(decayed_lr, lr)
+                last_steps = steps
+            last_value_var = tensor.fill_constant(
+                shape=[1],
+                dtype='float32',
+                value=float(lr_values[-1]))
+            with switch.default():
+                fluid.layers.tensor.assign(last_value_var, lr)
+
+    return lr
+
+
+def test_parallel(exe, test_args, args, test_prog, feeder, bs):
+    acc_evaluators = []
+    for i in xrange(len(test_args[2])):
+        acc_evaluators.append(fluid.metrics.Accuracy())
+
+    to_fetch = [v.name for v in test_args[2]]
+    test_reader = test_args[3]
+    batch_id = 0
+    start_ts = time.time()
+    for batch_id, data in enumerate(test_reader()):
+        acc_rets = exe.run(fetch_list=to_fetch, feed=feeder.feed(data))
+        ret_result = [np.mean(np.array(ret)) for ret in acc_rets]
+        print("Test batch: [%d], acc_rets: [%s]" % (batch_id, ret_result))
+        for i, e in enumerate(acc_evaluators):
+            e.update(value=np.array(acc_rets[i]), weight=bs)
+    num_samples = batch_id * bs * get_device_num()
+    print_train_time(start_ts, time.time(), num_samples)
+
+    return [e.eval() for e in acc_evaluators]
+
+
+def build_program(args, is_train, main_prog, startup_prog, py_reader_startup_prog,
+                  img_size, trn_dir, batch_size, min_scale, rect_val):
+    if is_train:
+        reader = torchvision_reader.train(
+            traindir=os.path.join(args.data_dir, trn_dir, "train"), sz=img_size, min_scale=min_scale)
+    else:
+        reader = torchvision_reader.test(
+            valdir=os.path.join(args.data_dir, trn_dir, "validation"),
+            bs=batch_size * get_device_num(), sz=img_size, rect_val=rect_val)
+    dshape = [3, img_size, img_size]
+    class_dim = 1000
+
+    pyreader = None
+    batched_reader = None
+    model_name = args.model
+    model_list = [m for m in dir(models) if "__" not in m]
+    assert model_name in model_list, "{} is not in lists: {}".format(args.model, model_list)
+    model = models.__dict__[model_name]()
+    with fluid.program_guard(main_prog, startup_prog):
+        with fluid.unique_name.guard():
+            if is_train:
+                with fluid.program_guard(main_prog, py_reader_startup_prog):
+                    with fluid.unique_name.guard():
+                        pyreader = fluid.layers.py_reader(
+                            capacity=batch_size * get_device_num(),
+                            shapes=([-1] + dshape, (-1, 1)),
+                            dtypes=('uint8', 'int64'),
+                            name="train_reader_" + str(img_size),
+                            use_double_buffer=True)
+                        input, label = fluid.layers.read_file(pyreader)
+                pyreader.decorate_paddle_reader(paddle.batch(reader, batch_size=batch_size))
+            else:
+                input = fluid.layers.data(name="image", shape=dshape, dtype="uint8")
+                label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+                batched_reader = paddle.batch(reader, batch_size=batch_size * get_device_num())
+            cast_img_type = "float16" if args.fp16 else "float32"
+            cast = fluid.layers.cast(input, cast_img_type)
+            img_mean = fluid.layers.create_global_var(
+                [3, 1, 1], 0.0, cast_img_type, name="img_mean", persistable=True)
+            img_std = fluid.layers.create_global_var(
+                [3, 1, 1], 0.0, cast_img_type, name="img_std", persistable=True)
+            # image = (image - (mean * 255.0)) / (std * 255.0)
+            t1 = fluid.layers.elementwise_sub(cast, img_mean, axis=1)
+            t2 = fluid.layers.elementwise_div(t1, img_std, axis=1)
+
+            predict = model.net(t2, class_dim=class_dim, img_size=img_size, is_train=is_train)
+            cost, pred = fluid.layers.softmax_with_cross_entropy(predict, label, return_softmax=True)
+            if args.scale_loss > 1:
+                avg_cost = fluid.layers.mean(x=cost) * float(args.scale_loss)
+            else:
+                avg_cost = fluid.layers.mean(x=cost)
+
+            batch_acc1 = fluid.layers.accuracy(input=pred, label=label, k=1)
+            batch_acc5 = fluid.layers.accuracy(input=pred, label=label, k=5)
+
+            # configure optimizer
+            optimizer = None
+            if is_train:
+                epochs = [(0, 7), (7, 13), (13, 22), (22, 25), (25, 28)]
+                bs_epoch = [x * get_device_num() for x in [224, 224, 96, 96, 50]]
+                # the trailing scalar is the terminal lr held after the last phase
+                lrs = [(1.0, 2.0), (2.0, 0.25),
+                       (0.42857142857142855, 0.04285714285714286),
+                       (0.04285714285714286, 0.004285714285714286),
+                       (0.0022321428571428575, 0.00022321428571428573),
+                       0.00022321428571428573]
+
+                optimizer = fluid.optimizer.Momentum(
+                    learning_rate=linear_lr_decay(lrs, epochs, bs_epoch, args.total_images),
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+                if args.fp16:
+                    params_grads = optimizer.backward(avg_cost)
+                    master_params_grads = utils.create_master_params_grads(
+                        params_grads, main_prog, startup_prog, args.scale_loss)
+                    optimizer.apply_gradients(master_params_grads)
+                    utils.master_param_to_train_param(master_params_grads, params_grads, main_prog)
+                else:
+                    optimizer.minimize(avg_cost)
+
+    if args.memory_optimize:
+        fluid.memory_optimize(main_prog, skip_grads=True)
+
+    return avg_cost, optimizer, [batch_acc1, batch_acc5], batched_reader, pyreader, py_reader_startup_prog
+
+
+def refresh_program(args, epoch, sz, trn_dir, bs, val_bs, need_update_start_prog=False,
+                    min_scale=0.08, rect_val=False):
+    print('program changed: epoch: [%d], image size: [%d], trn_dir: [%s], batch_size: [%d]' % (epoch, sz, trn_dir, bs))
+    train_prog = fluid.Program()
+    test_prog = fluid.Program()
+    startup_prog = fluid.Program()
+    py_reader_startup_prog = fluid.Program()
+
+    train_args = build_program(args, True, train_prog, startup_prog, py_reader_startup_prog,
+                               sz, trn_dir, bs, min_scale, False)
+    test_args = build_program(args, False, test_prog, startup_prog, py_reader_startup_prog,
+                              sz, trn_dir, val_bs, min_scale, rect_val)
+
+    place = core.CUDAPlace(0)
+    startup_exe = fluid.Executor(place)
+    print("execute py_reader startup program")
+    startup_exe.run(py_reader_startup_prog)
+
+    if need_update_start_prog:
+        print("execute startup program")
+        startup_exe.run(startup_prog)
+        if args.init_conv2d_kaiming:
+            import torch
+            conv2d_w_vars = [var for var in startup_prog.global_block().vars.values()
+                             if var.name.startswith('conv2d_')]
+            for var in conv2d_w_vars:
+                torch_w = torch.empty(var.shape)
+                kaiming_np = torch.nn.init.kaiming_normal_(
+                    torch_w, mode='fan_out', nonlinearity='relu').numpy()
+                tensor = fluid.global_scope().find_var(var.name).get_tensor()
+                if args.fp16:
+                    tensor.set(np.array(kaiming_np, dtype="float16").view(np.uint16), place)
+                else:
+                    tensor.set(np.array(kaiming_np, dtype="float32"), place)
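+    # NOTE: the .view(np.uint16) above reinterprets the fp16 payload as raw
+    # uint16 bits, presumably because LoDTensor.set in this Paddle version
+    # does not accept float16 numpy arrays directly; the same trick is used
+    # for img_mean/img_std below.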
+
+    np_tensors = {}
+    np_tensors["img_mean"] = np.array(
+        [0.485 * 255.0, 0.456 * 255.0, 0.406 * 255.0]).astype(
+            "float16" if args.fp16 else "float32").reshape((3, 1, 1))
+    np_tensors["img_std"] = np.array(
+        [0.229 * 255.0, 0.224 * 255.0, 0.225 * 255.0]).astype(
+            "float16" if args.fp16 else "float32").reshape((3, 1, 1))
+    for vname, np_tensor in np_tensors.items():
+        var = fluid.global_scope().find_var(vname)
+        if args.fp16:
+            var.get_tensor().set(np_tensor.view(np.uint16), place)
+        else:
+            var.get_tensor().set(np_tensor, place)
+
+    if DEBUG_PROG:
+        with open('/tmp/train_prog_pass%d' % epoch, 'w') as f:
+            f.write(train_prog.to_string(True))
+        with open('/tmp/test_prog_pass%d' % epoch, 'w') as f:
+            f.write(test_prog.to_string(True))
+        with open('/tmp/startup_prog_pass%d' % epoch, 'w') as f:
+            f.write(startup_prog.to_string(True))
+        with open('/tmp/py_reader_startup_prog_pass%d' % epoch, 'w') as f:
+            f.write(py_reader_startup_prog.to_string(True))
+
+    strategy = fluid.ExecutionStrategy()
+    strategy.num_threads = args.num_threads
+    strategy.allow_op_delay = False
+    build_strategy = fluid.BuildStrategy()
+    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+    avg_loss = train_args[0]
+    train_exe = fluid.ParallelExecutor(
+        True,
+        avg_loss.name,
+        main_program=train_prog,
+        exec_strategy=strategy,
+        build_strategy=build_strategy)
+    test_exe = fluid.ParallelExecutor(
+        True, main_program=test_prog, share_vars_from=train_exe)
+
+    return train_args, test_args, test_prog, train_exe, test_exe
+
+
+# NOTE: only need to benchmark using ParallelExecutor.
+def train_parallel(args):
+    over_all_start = time.time()
+    test_prog = fluid.Program()
+
+    exe = None
+    test_exe = None
+    train_args = None
+    test_args = None
+    bs = 224
+    val_bs = 64
+    for pass_id in range(args.num_epochs):
+        # Progressive resizing: rebuild the programs whenever the input size
+        # or batch size changes.
+        if pass_id == 0:
+            train_args, test_args, test_prog, exe, test_exe = refresh_program(
+                args, pass_id, sz=128, trn_dir="sz/160/", bs=bs, val_bs=val_bs,
+                need_update_start_prog=True)
+        elif pass_id == 13:
+            bs = 96
+            train_args, test_args, test_prog, exe, test_exe = refresh_program(
+                args, pass_id, sz=224, trn_dir="sz/352/", bs=bs, val_bs=val_bs, min_scale=0.087)
+        elif pass_id == 25:
+            bs = 50
+            val_bs = 4
+            train_args, test_args, test_prog, exe, test_exe = refresh_program(
+                args, pass_id, sz=288, trn_dir="", bs=bs, val_bs=val_bs, min_scale=0.5, rect_val=True)
+        else:
+            pass
+
+        avg_loss = train_args[0]
+        num_samples = 0
+        iters = 0
+        start_time = time.time()
+        train_args[4].start()  # start pyreader
+        while True:
+            fetch_list = [avg_loss.name]
+            acc_name_list = [v.name for v in train_args[2]]
+            fetch_list.extend(acc_name_list)
+            fetch_list.append("learning_rate")
+            should_print = (iters % args.log_period == 0)
+
+            fetch_ret = []
+            try:
+                # Fetching from the ParallelExecutor synchronizes and copies
+                # results back, so only fetch on iterations we print.
+                if should_print:
+                    fetch_ret = exe.run(fetch_list)
+                else:
+                    exe.run([])
+            except fluid.core.EOFException:
+                print("Finish current epoch, will reset pyreader...")
+                train_args[4].reset()
+                break
+            except fluid.core.EnforceNotMet:
+                traceback.print_exc()
+                exit(1)
+
+            num_samples += bs * get_device_num()
+
+            if should_print:
+                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
+                print("Pass %d, batch %d, loss %s, accuracies: %s, learning_rate %s, py_reader queue_size: %d" %
+                      (pass_id, iters, fetched_data[0], fetched_data[1:-1], fetched_data[-1], train_args[4].queue.size()))
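+            # (py_reader's queue_size above is a rough gauge of how many
+            # decoded batches are still buffered; if it hovers near zero, the
+            # reader processes are the bottleneck rather than the GPUs.)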
+            iters += 1
+
+        print_train_time(start_time, time.time(), num_samples)
+        feed_list = [test_prog.global_block().var(varname) for varname in ("image", "label")]
+        test_feeder = fluid.DataFeeder(feed_list=feed_list, place=fluid.CUDAPlace(0))
+        test_ret = test_parallel(test_exe, test_args, args, test_prog, test_feeder, val_bs)
+        print("Pass: %d, Test Accuracy: %s, Spent %.2f hours\n" %
+              (pass_id, [np.mean(np.array(v)) for v in test_ret], (time.time() - over_all_start) / 3600))
+
+    print("total train time: %f" % (time.time() - over_all_start))
+
+
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
+
+def print_paddle_envs():
+    print('----------- Configuration envs -----------')
+    for k in os.environ:
+        if "PADDLE_" in k:
+            print("ENV %s:%s" % (k, os.environ[k]))
+    print('------------------------------------------------')
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+    print_paddle_envs()
+    train_parallel(args)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/fluid/PaddleCV/image_classification/models/__init__.py b/fluid/PaddleCV/image_classification/models/__init__.py
index f05195f20110b0267de2980a60aa31ef848eb50e..8dedab8f14d345ddc6e2bc2b7fbf66eb6dfc50cc 100644
--- a/fluid/PaddleCV/image_classification/models/__init__.py
+++ b/fluid/PaddleCV/image_classification/models/__init__.py
@@ -9,3 +9,4 @@ from .inception_v4 import InceptionV4
 from .se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_32x4d
 from .dpn import DPN68, DPN92, DPN98, DPN107, DPN131
 from .shufflenet_v2 import ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, ShuffleNetV2_x2_0
+from .fast_resnet import FastResNet
diff --git a/fluid/PaddleCV/image_classification/models/fast_resnet.py b/fluid/PaddleCV/image_classification/models/fast_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4abd904dce3264c69f699344b24822eedd723f56
--- /dev/null
+++ b/fluid/PaddleCV/image_classification/models/fast_resnet.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
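+"""FastResNet: a ResNet-50/101/152 variant tuned for fast ImageNet training.
+
+Differences from the stock ResNet in this directory: the final fc layer is
+initialized from N(0, 0.01), and the last batch_norm of every bottleneck
+block is zero-initialized (bn_init_value=0.0), so each residual branch
+starts out as an identity mapping.
+
+Minimal usage sketch (assumes it is called while building a fluid program,
+with `image` already defined as an input variable):
+
+    model = FastResNet(layers=50)
+    logits = model.net(image, class_dim=1000, img_size=224, is_train=True)
+"""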
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+
+# visreader for imagenet
+import torchvision_reader
+
+__all__ = ["FastResNet"]
+
+
+class FastResNet():
+    def __init__(self, layers=50):
+        self.layers = layers
+
+    def net(self, input, class_dim=1000, img_size=224, is_train=True):
+        layers = self.layers
+        supported_layers = [50, 101, 152]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+        if layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        num_filters = [64, 128, 256, 512]
+
+        conv = self.conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu', is_train=is_train)
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+
+        for block in range(len(depth)):
+            for i in range(depth[block]):
+                conv = self.bottleneck_block(
+                    input=conv,
+                    num_filters=num_filters[block],
+                    stride=2 if i == 0 and block != 0 else 1)
+        pool_size = int(img_size / 32)
+        pool = fluid.layers.pool2d(
+            input=conv, pool_size=pool_size, pool_type='avg', global_pooling=True)
+        out = fluid.layers.fc(input=pool,
+                              size=class_dim,
+                              act=None,
+                              param_attr=fluid.param_attr.ParamAttr(
+                                  initializer=fluid.initializer.NormalInitializer(0.0, 0.01),
+                                  regularizer=fluid.regularizer.L2Decay(1e-4)),
+                              bias_attr=fluid.ParamAttr(
+                                  regularizer=fluid.regularizer.L2Decay(1e-4)))
+        return out
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None,
+                      bn_init_value=1.0,
+                      is_train=True):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            bias_attr=False,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.MSRAInitializer(),
+                regularizer=fluid.regularizer.L2Decay(1e-4)))
+        return fluid.layers.batch_norm(
+            input=conv, act=act, is_test=not is_train,
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(bn_init_value),
+                regularizer=None))
+
+    def shortcut(self, input, ch_out, stride):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            return self.conv_bn_layer(input, ch_out, 1, stride)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride):
+        conv0 = self.conv_bn_layer(
+            input=input, num_filters=num_filters, filter_size=1, act='relu')
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu')
+        # zero-init the last bn weight so the residual branch starts as identity
+        conv2 = self.conv_bn_layer(
+            input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, bn_init_value=0.0)
+
+        short = self.shortcut(input, num_filters * 4, stride)
+
+        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def _model_reader_dshape_classdim(args, is_train, val_bs=None, sz=224, trn_dir="",
+                                  min_scale=0.08, rect_val=False):
+    reader = None
+    if args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, sz, sz]
+        else:
+            dshape = [sz, sz, 3]
+        if is_train:
+            reader = torchvision_reader.train(
+                traindir="/data/imagenet/%strain" % trn_dir, sz=sz, min_scale=min_scale)
+        else:
+            reader = torchvision_reader.test(
+                valdir="/data/imagenet/%svalidation" % trn_dir, bs=val_bs, sz=sz, rect_val=rect_val)
+    else:
+        raise ValueError("only support imagenet dataset.")
+
+    return None, reader, dshape, class_dim
+
+
+def lr_decay(lrs, epochs, bs, total_image):
+    """Expand per-phase (start_lr, end_lr) ranges into piecewise boundaries/values.
+
+    NOTE: assumes 8 trainers when converting images to steps (bs[idx] * 8).
+    """
+    boundaries = []
+    values = []
+    import math
+    for idx, epoch in enumerate(epochs):
+        step = math.ceil(total_image * 1.0 / (bs[idx] * 8))
+        ratio = (lrs[idx][1] - lrs[idx][0]) / (epoch[1] - epoch[0])
+        lr_base = lrs[idx][0]
+        for s in xrange(epoch[0], epoch[1]):
+            if boundaries:
+                boundaries.append(boundaries[-1] + step)
+            else:
+                boundaries = [step]
+            values.append(lr_base + ratio * (s - epoch[0]))
+    values.append(lrs[-1])
+    return boundaries, values
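+
+
+if __name__ == "__main__":
+    # Minimal sketch (not wired into training): demonstrates the shape of the
+    # lr_decay output, which matches what fluid.layers.piecewise_decay expects
+    # (len(values) == len(boundaries) + 1). The numbers here are illustrative
+    # only, not the tuned schedule from train.py.
+    demo_lrs = [(1.0, 2.0), (2.0, 0.25), 0.25]
+    demo_epochs = [(0, 7), (7, 13)]
+    demo_bs = [224, 96]
+    boundaries, values = lr_decay(demo_lrs, demo_epochs, demo_bs, total_image=1281167)
+    print("boundaries: %s ..." % boundaries[:5])
+    print("values: %s ..." % values[:5])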