未验证 提交 e33dc481 编写于 作者: Z zhouzj 提交者: GitHub

remove with_data_parallel (#1658)

* remove with_data_parallel

* ACT adapts fleet

* ACT's demo adapts fleet

* fix bugs
上级 65c776de
...@@ -13,7 +13,6 @@ import paddle.static as static ...@@ -13,7 +13,6 @@ import paddle.static as static
from paddleslim.analysis import flops from paddleslim.analysis import flops
from paddleslim.nas import SANAS from paddleslim.nas import SANAS
from paddleslim.common import get_logger from paddleslim.common import get_logger
from optimizer import create_optimizer
import imagenet_reader import imagenet_reader
_logger = get_logger(__name__, level=logging.INFO) _logger = get_logger(__name__, level=logging.INFO)
...@@ -157,15 +156,13 @@ def search_mobilenetv2_block(config, args, image_size): ...@@ -157,15 +156,13 @@ def search_mobilenetv2_block(config, args, image_size):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=avg_cost.name, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()): for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name] fetches = [avg_cost.name]
s_time = time.time() s_time = time.time()
outs = exe.run(train_compiled_program, outs = exe.run(
feed=data, train_compiled_program, feed=data, fetch_list=fetches)[0]
fetch_list=fetches)[0]
batch_time = time.time() - s_time batch_time = time.time() - s_time
if batch_id % 10 == 0: if batch_id % 10 == 0:
_logger.info( _logger.info(
...@@ -175,9 +172,8 @@ def search_mobilenetv2_block(config, args, image_size): ...@@ -175,9 +172,8 @@ def search_mobilenetv2_block(config, args, image_size):
reward = [] reward = []
for batch_id, data in enumerate(val_loader()): for batch_id, data in enumerate(val_loader()):
test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name] test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
batch_reward = exe.run(test_program, batch_reward = exe.run(
feed=data, test_program, feed=data, fetch_list=test_fetches)
fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1) reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg) reward.append(reward_avg)
......
...@@ -141,15 +141,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True): ...@@ -141,15 +141,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=avg_cost.name, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()): for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name] fetches = [avg_cost.name]
s_time = time.time() s_time = time.time()
outs = exe.run(train_compiled_program, outs = exe.run(
feed=data, train_compiled_program, feed=data, fetch_list=fetches)[0]
fetch_list=fetches)[0]
batch_time = time.time() - s_time batch_time = time.time() - s_time
if batch_id % 10 == 0: if batch_id % 10 == 0:
_logger.info( _logger.info(
...@@ -161,9 +159,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True): ...@@ -161,9 +159,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
test_fetches = [ test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
] ]
batch_reward = exe.run(test_program, batch_reward = exe.run(
feed=data, test_program, feed=data, fetch_list=test_fetches)
fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1) reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg) reward.append(reward_avg)
......
...@@ -134,15 +134,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True): ...@@ -134,15 +134,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=avg_cost.name, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()): for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name] fetches = [avg_cost.name]
s_time = time.time() s_time = time.time()
outs = exe.run(train_compiled_program, outs = exe.run(
feed=data, train_compiled_program, feed=data, fetch_list=fetches)[0]
fetch_list=fetches)[0]
batch_time = time.time() - s_time batch_time = time.time() - s_time
if batch_id % 10 == 0: if batch_id % 10 == 0:
_logger.info( _logger.info(
...@@ -154,9 +152,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True): ...@@ -154,9 +152,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
test_fetches = [ test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
] ]
batch_reward = exe.run(test_program, batch_reward = exe.run(
feed=data, test_program, feed=data, fetch_list=test_fetches)
fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1) reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg) reward.append(reward_avg)
...@@ -223,15 +220,13 @@ def test_search_result(tokens, image_size, args, config): ...@@ -223,15 +220,13 @@ def test_search_result(tokens, image_size, args, config):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=avg_cost.name, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()): for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name] fetches = [avg_cost.name]
s_time = time.time() s_time = time.time()
outs = exe.run(train_compiled_program, outs = exe.run(
feed=data, train_compiled_program, feed=data, fetch_list=fetches)[0]
fetch_list=fetches)[0]
batch_time = time.time() - s_time batch_time = time.time() - s_time
if batch_id % 10 == 0: if batch_id % 10 == 0:
_logger.info( _logger.info(
...@@ -243,9 +238,8 @@ def test_search_result(tokens, image_size, args, config): ...@@ -243,9 +238,8 @@ def test_search_result(tokens, image_size, args, config):
test_fetches = [ test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
] ]
batch_reward = exe.run(test_program, batch_reward = exe.run(
feed=data, test_program, feed=data, fetch_list=test_fetches)
fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1) reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg) reward.append(reward_avg)
......
...@@ -119,8 +119,8 @@ def train(main_prog, exe, epoch_id, train_loader, fetch_list, args): ...@@ -119,8 +119,8 @@ def train(main_prog, exe, epoch_id, train_loader, fetch_list, args):
[[drop_path_probility * epoch_id / args.retain_epoch] [[drop_path_probility * epoch_id / args.retain_epoch]
for i in range(args.batch_size)]).astype(np.float32) for i in range(args.batch_size)]).astype(np.float32)
drop_path_mask = 1 - np.random.binomial( drop_path_mask = 1 - np.random.binomial(
1, drop_path_prob[0], 1, drop_path_prob[0], size=[args.batch_size, 20, 4, 2
size=[args.batch_size, 20, 4, 2]).astype(np.float32) ]).astype(np.float32)
feed.append({ feed.append({
"image": image, "image": image,
"label": label, "label": label,
...@@ -195,8 +195,8 @@ def search(config, args, image_size, is_server=True): ...@@ -195,8 +195,8 @@ def search(config, args, image_size, is_server=True):
current_params = count_parameters_in_MB( current_params = count_parameters_in_MB(
train_program.global_block().all_parameters(), 'cifar10') train_program.global_block().all_parameters(), 'cifar10')
_logger.info('step: {}, current_params: {}M'.format(step, _logger.info(
current_params)) 'step: {}, current_params: {}M'.format(step, current_params))
if current_params > float(3.77): if current_params > float(3.77):
continue continue
...@@ -222,9 +222,7 @@ def search(config, args, image_size, is_server=True): ...@@ -222,9 +222,7 @@ def search(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=train_fetch_list[0].name,
build_strategy=build_strategy)
valid_top1_list = [] valid_top1_list = []
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
...@@ -234,8 +232,8 @@ def search(config, args, image_size, is_server=True): ...@@ -234,8 +232,8 @@ def search(config, args, image_size, is_server=True):
step, epoch_id, train_top1)) step, epoch_id, train_top1))
valid_top1 = valid(test_program, exe, epoch_id, test_loader, valid_top1 = valid(test_program, exe, epoch_id, test_loader,
test_fetch_list, args) test_fetch_list, args)
_logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, _logger.info(
valid_top1)) "TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
valid_top1_list.append(valid_top1) valid_top1_list.append(valid_top1)
sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2) sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
...@@ -276,19 +274,18 @@ def final_test(config, args, image_size, token=None): ...@@ -276,19 +274,18 @@ def final_test(config, args, image_size, token=None):
build_strategy = static.BuildStrategy() build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram( train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=train_fetch_list[0].name, build_strategy=build_strategy)
valid_top1_list = [] valid_top1_list = []
for epoch_id in range(args.retain_epoch): for epoch_id in range(args.retain_epoch):
train_top1 = train(train_compiled_program, exe, epoch_id, train_loader, train_top1 = train(train_compiled_program, exe, epoch_id, train_loader,
train_fetch_list, args) train_fetch_list, args)
_logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id, _logger.info(
train_top1)) "TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
valid_top1 = valid(test_program, exe, epoch_id, test_loader, valid_top1 = valid(test_program, exe, epoch_id, test_loader,
test_fetch_list, args) test_fetch_list, args)
_logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, _logger.info(
valid_top1)) "TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
valid_top1_list.append(valid_top1) valid_top1_list.append(valid_top1)
output_dir = os.path.join('darts_output', str(epoch_id)) output_dir = os.path.join('darts_output', str(epoch_id))
......
...@@ -34,6 +34,7 @@ tar -xf MobileNetV1_pretrained.tar ...@@ -34,6 +34,7 @@ tar -xf MobileNetV1_pretrained.tar
通过以下命令启动裁剪任务: 通过以下命令启动裁剪任务:
- 单卡启动:
``` ```
export CUDA_VISIBLE_DEVICES=0 export CUDA_VISIBLE_DEVICES=0
python train.py \ python train.py \
...@@ -43,9 +44,18 @@ python train.py \ ...@@ -43,9 +44,18 @@ python train.py \
--criterion "l1_norm" --criterion "l1_norm"
``` ```
其中,`model`用于指定待裁剪的模型。`pruned_ratio`用于指定各个卷积层通道数被裁剪的比例。`data`选项用于指定使用的数据集。 - 多卡启动:
`criterion` 选项用于指定所使用的剪裁算法策略,现在支持`l1_norm`, `bn_scale`, `geometry_median`。默认为`l1_norm`。可以 ```
设置该参数以改变剪裁算法策略。该目录下的四个shell脚本文件是在ResNet34, MobileNetV1, MobileNetV2等三个模型上进行的四组 export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch train.py \
--model "MobileNet" \
--pruned_ratio 0.31 \
--data "mnist" \
--criterion "l1_norm" \
--fleet
```
其中,`model`用于指定待裁剪的模型。`pruned_ratio`用于指定各个卷积层通道数被裁剪的比例。`data`选项用于指定使用的数据集。`criterion` 选项用于指定所使用的剪裁算法策略,现在支持`l1_norm`, `bn_scale`, `geometry_median`,默认为`l1_norm`。`--fleet` 用于开启多卡训练,在多卡启动时需要指定该参数。该目录下的四个shell脚本文件是在ResNet34, MobileNetV1, MobileNetV2等三个模型上进行的四组
`criterion`设置为`geometry_median`的实验,可以直接运行脚本文件启动剪裁实验。 `criterion`设置为`geometry_median`的实验,可以直接运行脚本文件启动剪裁实验。
执行`python train.py --help`查看更多选项。 执行`python train.py --help`查看更多选项。
......
#!/bin/bash #!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1 export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98 export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \ python -m paddle.distributed.launch train.py \
--model="MobileNet" \ --model="MobileNet" \
--pretrained_model="/workspace/models/MobileNetV1_pretrained" \ --pretrained_model="/workspace/models/MobileNetV1_pretrained" \
--data="imagenet" \ --data="imagenet" \
...@@ -14,4 +14,5 @@ python train.py \ ...@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \ --lr_strategy="piecewise_decay" \
--criterion="geometry_median" \ --criterion="geometry_median" \
--model_path="./fpgm_mobilenetv1_models" \ --model_path="./fpgm_mobilenetv1_models" \
--fleet \
2>&1 | tee fpgm_mobilenetv1_train.log 2>&1 | tee fpgm_mobilenetv1_train.log
#!/bin/bash #!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1 export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98 export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \ python -m paddle.distributed.launch train.py \
--model="MobileNetV2" \ --model="MobileNetV2" \
--pretrained_model="/workspace/models/MobileNetV2_pretrained" \ --pretrained_model="/workspace/models/MobileNetV2_pretrained" \
--data="imagenet" \ --data="imagenet" \
...@@ -14,4 +14,5 @@ python train.py \ ...@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \ --lr_strategy="piecewise_decay" \
--criterion="geometry_median" \ --criterion="geometry_median" \
--model_path="./fpgm_mobilenetv2_models" \ --model_path="./fpgm_mobilenetv2_models" \
--fleet \
2>&1 | tee fpgm_mobilenetv2_train.log 2>&1 | tee fpgm_mobilenetv2_train.log
#!/bin/bash #!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1,2,3 export CUDA_VISIBLE_DEVICES=0,1,2,3
export FLAGS_fraction_of_gpu_memory_to_use=0.98 export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \ python -m paddle.distributed.launch train.py \
--model="ResNet34" \ --model="ResNet34" \
--pretrained_model="/workspace/models/ResNet34_pretrained" \ --pretrained_model="/workspace/models/ResNet34_pretrained" \
--data="imagenet" \ --data="imagenet" \
...@@ -9,4 +9,5 @@ python train.py \ ...@@ -9,4 +9,5 @@ python train.py \
--lr_strategy="cosine_decay" \ --lr_strategy="cosine_decay" \
--criterion="geometry_median" \ --criterion="geometry_median" \
--model_path="./fpgm_resnet34_025_120_models" \ --model_path="./fpgm_resnet34_025_120_models" \
--fleet \
2>&1 | tee fpgm_resnet025_120_train.log 2>&1 | tee fpgm_resnet025_120_train.log
#!/bin/bash #!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1 export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98 export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \ python -m paddle.distributed.launch train.py \
--model="ResNet34" \ --model="ResNet34" \
--pretrained_model="/workspace/models/ResNet34_pretrained" \ --pretrained_model="/workspace/models/ResNet34_pretrained" \
--data="imagenet" \ --data="imagenet" \
...@@ -14,4 +14,5 @@ python train.py \ ...@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \ --lr_strategy="piecewise_decay" \
--criterion="geometry_median" \ --criterion="geometry_median" \
--model_path="./fpgm_resnet34_models" \ --model_path="./fpgm_resnet34_models" \
--fleet \
2>&1 | tee fpgm_resnet03_train.log 2>&1 | tee fpgm_resnet03_train.log
...@@ -15,6 +15,7 @@ from paddleslim.analysis import flops ...@@ -15,6 +15,7 @@ from paddleslim.analysis import flops
import models import models
from utility import add_arguments, print_arguments from utility import add_arguments, print_arguments
import paddle.vision.transforms as T import paddle.vision.transforms as T
from paddle.distributed import fleet
_logger = get_logger(__name__, level=logging.INFO) _logger = get_logger(__name__, level=logging.INFO)
...@@ -40,6 +41,7 @@ add_arg('pruned_ratio', float, None, "The ratios to be pruned.") ...@@ -40,6 +41,7 @@ add_arg('pruned_ratio', float, None, "The ratios to be pruned.")
add_arg('criterion', str, "l1_norm", "The prune criterion to be used, support l1_norm and batch_norm_scale.") add_arg('criterion', str, "l1_norm", "The prune criterion to be used, support l1_norm and batch_norm_scale.")
add_arg('save_inference', bool, False, "Whether to save inference model.") add_arg('save_inference', bool, False, "Whether to save inference model.")
add_arg('ce_test', bool, False, "Whether to CE test.") add_arg('ce_test', bool, False, "Whether to CE test.")
# Opt-in switch for distributed (fleet) training. The leading "--" makes
# argparse register an optional flag: passing `--fleet` sets args.fleet to
# True, omitting it leaves args.fleet False. Without the dashes the name is
# a positional argument and the documented `--fleet` invocation is rejected.
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable # yapf: enable
model_list = models.__all__ model_list = models.__all__
...@@ -96,6 +98,8 @@ def create_optimizer(args, step_per_epoch): ...@@ -96,6 +98,8 @@ def create_optimizer(args, step_per_epoch):
def compress(args): def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4 num_workers = 4
shuffle = True shuffle = True
...@@ -130,8 +134,8 @@ def compress(args): ...@@ -130,8 +134,8 @@ def compress(args):
else: else:
raise ValueError("{} is not supported.".format(args.data)) raise ValueError("{} is not supported.".format(args.data))
image_shape = [int(m) for m in image_shape.split(",")] image_shape = [int(m) for m in image_shape.split(",")]
assert args.model in model_list, "{} is not in lists: {}".format(args.model, assert args.model in model_list, "{} is not in lists: {}".format(
model_list) args.model, model_list)
places = paddle.static.cuda_places( places = paddle.static.cuda_places(
) if args.use_gpu else paddle.static.cpu_places() ) if args.use_gpu else paddle.static.cpu_places()
place = places[0] place = places[0]
...@@ -140,13 +144,16 @@ def compress(args): ...@@ -140,13 +144,16 @@ def compress(args):
name='image', shape=[None] + image_shape, dtype='float32') name='image', shape=[None] + image_shape, dtype='float32')
label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
batch_size_per_card = int(args.batch_size / len(places)) batch_size_per_card = int(args.batch_size / len(places))
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=batch_size_per_card)
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
train_dataset, train_dataset,
places=places, places=places,
feed_list=[image, label], feed_list=[image, label],
drop_last=True, batch_sampler=sampler,
batch_size=batch_size_per_card,
shuffle=shuffle,
return_list=False, return_list=False,
use_shared_memory=True, use_shared_memory=True,
num_workers=num_workers) num_workers=num_workers)
...@@ -171,6 +178,8 @@ def compress(args): ...@@ -171,6 +178,8 @@ def compress(args):
acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5) acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
val_program = paddle.static.default_main_program().clone(for_test=True) val_program = paddle.static.default_main_program().clone(for_test=True)
opt, learning_rate = create_optimizer(args, step_per_epoch) opt, learning_rate = create_optimizer(args, step_per_epoch)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost) opt.minimize(avg_cost)
exe.run(paddle.static.default_startup_program()) exe.run(paddle.static.default_startup_program())
...@@ -180,8 +189,8 @@ def compress(args): ...@@ -180,8 +189,8 @@ def compress(args):
def if_exist(var): def if_exist(var):
return os.path.exists(os.path.join(args.pretrained_model, var.name)) return os.path.exists(os.path.join(args.pretrained_model, var.name))
_logger.info("Load pretrained model from {}".format( _logger.info(
args.pretrained_model)) "Load pretrained model from {}".format(args.pretrained_model))
paddle.static.load(paddle.static.default_main_program(), paddle.static.load(paddle.static.default_main_program(),
args.pretrained_model, exe) args.pretrained_model, exe)
...@@ -247,13 +256,10 @@ def compress(args): ...@@ -247,13 +256,10 @@ def compress(args):
place=place) place=place)
_logger.info("FLOPs after pruning: {}".format(flops(pruned_program))) _logger.info("FLOPs after pruning: {}".format(flops(pruned_program)))
build_strategy = paddle.static.BuildStrategy() if args.fleet:
exec_strategy = paddle.static.ExecutionStrategy() train_program = paddle.static.CompiledProgram(pruned_program)
train_program = paddle.static.CompiledProgram( else:
pruned_program).with_data_parallel( train_program = pruned_program
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
for i in range(args.num_epochs): for i in range(args.num_epochs):
train(i, train_program) train(i, train_program)
...@@ -268,8 +274,8 @@ def compress(args): ...@@ -268,8 +274,8 @@ def compress(args):
infer_model_path, [image], [out], infer_model_path, [image], [out],
exe, exe,
program=pruned_val_program) program=pruned_val_program)
_logger.info("Saved inference model into [{}]".format( _logger.info(
infer_model_path)) "Saved inference model into [{}]".format(infer_model_path))
def main(): def main():
......
...@@ -143,16 +143,7 @@ compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, a ...@@ -143,16 +143,7 @@ compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, a
### 关掉指定build策略 ### 关掉指定build策略
```
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
```
### 训练命令 ### 训练命令
...@@ -192,8 +183,10 @@ python train.py ...@@ -192,8 +183,10 @@ python train.py
多卡启动PACT量化训练: 多卡启动PACT量化训练:
``` ```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64 CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64 --fleet
``` ```
> 多卡训练需要指定 `--fleet` 参数。
输出结果为 输出结果为
``` ```
......
...@@ -19,6 +19,7 @@ from paddleslim.quant import quant_aware, quant_post, convert ...@@ -19,6 +19,7 @@ from paddleslim.quant import quant_aware, quant_post, convert
import models import models
from utility import add_arguments, print_arguments from utility import add_arguments, print_arguments
from paddle.common_ops_import import LayerHelper from paddle.common_ops_import import LayerHelper
from paddle.distributed import fleet
quantization_model_save_dir = './quantization_models/' quantization_model_save_dir = './quantization_models/'
_logger = get_logger(__name__, level=logging.INFO) _logger = get_logger(__name__, level=logging.INFO)
...@@ -68,6 +69,7 @@ add_arg('analysis', bool, False, ...@@ -68,6 +69,7 @@ add_arg('analysis', bool, False,
add_arg('onnx_format', bool, False, add_arg('onnx_format', bool, False,
"Whether use onnx format or not.") "Whether use onnx format or not.")
add_arg('ce_test', bool, False, "Whether to CE test.") add_arg('ce_test', bool, False, "Whether to CE test.")
# Opt-in switch for distributed (fleet) training. The leading "--" makes
# argparse register an optional flag: passing `--fleet` sets args.fleet to
# True, omitting it leaves args.fleet False. Without the dashes the name is
# a positional argument and the documented `--fleet` invocation is rejected.
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable # yapf: enable
...@@ -120,6 +122,9 @@ def _prepare_envs(): ...@@ -120,6 +122,9 @@ def _prepare_envs():
def compress(args): def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4 num_workers = 4
shuffle = True shuffle = True
if args.ce_test: if args.ce_test:
...@@ -166,20 +171,24 @@ def compress(args): ...@@ -166,20 +171,24 @@ def compress(args):
if not args.analysis: if not args.analysis:
learning_rate, opt = create_optimizer(args) learning_rate, opt = create_optimizer(args)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost) opt.minimize(avg_cost)
exe, places = _prepare_envs() exe, places = _prepare_envs()
exe.run(paddle.static.default_startup_program()) exe.run(paddle.static.default_startup_program())
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=args.batch_size)
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
train_dataset, train_dataset,
places=places, places=places,
feed_list=[image, label], feed_list=[image, label],
drop_last=True,
return_list=False, return_list=False,
batch_size=args.batch_size, batch_sampler=sampler,
use_shared_memory=True, use_shared_memory=True,
shuffle=shuffle,
num_workers=num_workers) num_workers=num_workers)
valid_loader = paddle.io.DataLoader( valid_loader = paddle.io.DataLoader(
...@@ -379,22 +388,12 @@ def compress(args): ...@@ -379,22 +388,12 @@ def compress(args):
batch_id += 1 batch_id += 1
lr.step() lr.step()
build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
# train loop # train loop
best_acc1 = 0.0 best_acc1 = 0.0
best_epoch = 0 best_epoch = 0
start_epoch = 0 start_epoch = 0
if args.checkpoint_dir is not None: if args.checkpoint_dir is not None:
ckpt_path = args.checkpoint_dir
assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set" assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set"
start_epoch = args.checkpoint_epoch start_epoch = args.checkpoint_epoch
paddle.static.load( paddle.static.load(
......
...@@ -52,18 +52,7 @@ val_program = quant_aware(val_program, place, quant_config, scope=None, for_test ...@@ -52,18 +52,7 @@ val_program = quant_aware(val_program, place, quant_config, scope=None, for_test
compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False) compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
``` ```
### 关掉指定build策略
```
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
```
### 训练命令 ### 训练命令
...@@ -81,7 +70,8 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log - ...@@ -81,7 +70,8 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log -
--model MobileNet \ --model MobileNet \
--pretrained_model ./pretrain/MobileNetV1_pretrained \ --pretrained_model ./pretrain/MobileNetV1_pretrained \
--checkpoint_dir ./output/mobilenetv1 \ --checkpoint_dir ./output/mobilenetv1 \
--num_epochs 30 --num_epochs 30 \
--fleet
``` ```
运行之后,可看到``best_model``的最后测试结果,和MobileNet量化前的精度top1=70.99%, top5=89.68%非常相近。 运行之后,可看到``best_model``的最后测试结果,和MobileNet量化前的精度top1=70.99%, top5=89.68%非常相近。
...@@ -15,6 +15,7 @@ from paddleslim.analysis import flops ...@@ -15,6 +15,7 @@ from paddleslim.analysis import flops
from paddleslim.quant import quant_aware, convert from paddleslim.quant import quant_aware, convert
import paddle.vision.transforms as T import paddle.vision.transforms as T
import models import models
from paddle.distributed import fleet
from utility import add_arguments, print_arguments from utility import add_arguments, print_arguments
quantization_model_save_dir = './quantization_models/' quantization_model_save_dir = './quantization_models/'
...@@ -41,6 +42,7 @@ add_arg('log_period', int, 10, "Log period in batches.") ...@@ -41,6 +42,7 @@ add_arg('log_period', int, 10, "Log period in batches.")
add_arg('checkpoint_dir', str, "output", "checkpoint save dir") add_arg('checkpoint_dir', str, "output", "checkpoint save dir")
add_arg('ce_test', bool, False, "Whether to CE test.") add_arg('ce_test', bool, False, "Whether to CE test.")
add_arg('onnx_format', bool, False, "Whether to export the quantized model with format of ONNX.") add_arg('onnx_format', bool, False, "Whether to export the quantized model with format of ONNX.")
# Opt-in switch for distributed (fleet) training. The leading "--" makes
# argparse register an optional flag: passing `--fleet` sets args.fleet to
# True, omitting it leaves args.fleet False. Without the dashes the name is
# a positional argument and the documented `--fleet` invocation is rejected.
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable # yapf: enable
model_list = [m for m in dir(models) if "__" not in m] model_list = [m for m in dir(models) if "__" not in m]
...@@ -92,6 +94,9 @@ def _prepare_envs(): ...@@ -92,6 +94,9 @@ def _prepare_envs():
def compress(args): def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4 num_workers = 4
shuffle = True shuffle = True
if args.ce_test: if args.ce_test:
...@@ -154,8 +159,8 @@ def compress(args): ...@@ -154,8 +159,8 @@ def compress(args):
raise ValueError("{} is not supported.".format(args.data)) raise ValueError("{} is not supported.".format(args.data))
image_shape = [int(m) for m in image_shape.split(",")] image_shape = [int(m) for m in image_shape.split(",")]
assert args.model in model_list, "{} is not in lists: {}".format(args.model, assert args.model in model_list, "{} is not in lists: {}".format(
model_list) args.model, model_list)
image = paddle.static.data( image = paddle.static.data(
name='image', shape=[None] + image_shape, dtype='float32') name='image', shape=[None] + image_shape, dtype='float32')
label = paddle.static.data(name='label', shape=[None, 1], dtype='int64') label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
...@@ -182,6 +187,8 @@ def compress(args): ...@@ -182,6 +187,8 @@ def compress(args):
compiled_train_prog = quant_aware( compiled_train_prog = quant_aware(
train_prog, places, quant_config, scope=None, for_test=False) train_prog, places, quant_config, scope=None, for_test=False)
opt = create_optimizer(args) opt = create_optimizer(args)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost) opt.minimize(avg_cost)
exe.run(paddle.static.default_startup_program()) exe.run(paddle.static.default_startup_program())
...@@ -192,16 +199,18 @@ def compress(args): ...@@ -192,16 +199,18 @@ def compress(args):
if args.pretrained_model: if args.pretrained_model:
paddle.static.load(train_prog, args.pretrained_model, exe) paddle.static.load(train_prog, args.pretrained_model, exe)
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=args.batch_size)
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
train_dataset, train_dataset,
places=places, places=places,
feed_list=[image, label], feed_list=[image, label],
drop_last=True, batch_sampler=sampler,
batch_size=args.batch_size,
return_list=False, return_list=False,
use_shared_memory=True, use_shared_memory=True,
shuffle=shuffle,
num_workers=num_workers) num_workers=num_workers)
valid_loader = paddle.io.DataLoader( valid_loader = paddle.io.DataLoader(
val_dataset, val_dataset,
...@@ -257,17 +266,6 @@ def compress(args): ...@@ -257,17 +266,6 @@ def compress(args):
end_time - start_time)) end_time - start_time))
batch_id += 1 batch_id += 1
build_strategy = paddle.static.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
############################################################################################################ ############################################################################################################
# train loop # train loop
############################################################################################################ ############################################################################################################
......
...@@ -133,8 +133,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id, weight, ...@@ -133,8 +133,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id, weight,
if int(os.getenv("CPU_NUM")) > 1: if int(os.getenv("CPU_NUM")) > 1:
build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce
program = paddle.static.CompiledProgram(train_program).with_data_parallel( program = paddle.static.CompiledProgram(
loss_name=loss.name, build_strategy=build_strategy) train_program, build_strategy=build_strategy)
for pass_id in range(args.num_passes): for pass_id in range(args.num_passes):
py_reader.start() py_reader.start()
......
...@@ -71,21 +71,25 @@ def test(test_exe, test_program, test_out, args): ...@@ -71,21 +71,25 @@ def test(test_exe, test_program, test_out, args):
for idx, data in enumerate(test_reader()): for idx, data in enumerate(test_reader()):
res = [] res = []
res.append( res.append(
test_exe.run(test_program, test_exe.run(
feed={u'image_test': data[0][u'image_test1']}, test_program,
fetch_list=out_feature)) feed={u'image_test': data[0][u'image_test1']},
fetch_list=out_feature))
res.append( res.append(
test_exe.run(test_program, test_exe.run(
feed={u'image_test': data[0][u'image_test2']}, test_program,
fetch_list=out_feature)) feed={u'image_test': data[0][u'image_test2']},
fetch_list=out_feature))
res.append( res.append(
test_exe.run(test_program, test_exe.run(
feed={u'image_test': data[0][u'image_test3']}, test_program,
fetch_list=out_feature)) feed={u'image_test': data[0][u'image_test3']},
fetch_list=out_feature))
res.append( res.append(
test_exe.run(test_program, test_exe.run(
feed={u'image_test': data[0][u'image_test4']}, test_program,
fetch_list=out_feature)) feed={u'image_test': data[0][u'image_test4']},
fetch_list=out_feature))
featureL = np.concatenate((res[0][0], res[1][0]), 1) featureL = np.concatenate((res[0][0], res[1][0]), 1)
featureR = np.concatenate((res[2][0], res[3][0]), 1) featureR = np.concatenate((res[2][0], res[3][0]), 1)
if featureLs is None: if featureLs is None:
...@@ -119,14 +123,12 @@ def train(exe, train_program, train_out, test_program, test_out, args): ...@@ -119,14 +123,12 @@ def train(exe, train_program, train_out, test_program, test_out, args):
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True build_strategy.fuse_all_optimizer_ops = True
compiled_prog = paddle.static.CompiledProgram( compiled_prog = paddle.static.CompiledProgram(
train_program, build_strategy=build_strategy).with_data_parallel( train_program, build_strategy=build_strategy)
loss_name=loss.name, build_strategy=build_strategy)
best_ave = 0 best_ave = 0
for epoch_id in range(args.start_epoch, args.total_epoch): for epoch_id in range(args.start_epoch, args.total_epoch):
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
loss, acc, global_lr = exe.run(compiled_prog, loss, acc, global_lr = exe.run(
feed=data, compiled_prog, feed=data, fetch_list=fetch_list_train)
fetch_list=fetch_list_train)
avg_loss = np.mean(np.array(loss)) avg_loss = np.mean(np.array(loss))
avg_acc = np.mean(np.array(acc)) avg_acc = np.mean(np.array(acc))
print( print(
......
...@@ -63,7 +63,7 @@ def _get_skip_params(program): ...@@ -63,7 +63,7 @@ def _get_skip_params(program):
skip_params.add(input.name()) skip_params.add(input.name())
for param in program.all_parameters(): for param in program.all_parameters():
if len(param.shape) == 1: if len(param.shape) == 1:
skip_params.add(param.name) skip_params.add(param.name)
return skip_params return skip_params
``` ```
...@@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05 ...@@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05
CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05 --pruning_mode ratio --ratio 0.55 CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05 --pruning_mode ratio --ratio 0.55
``` ```
GPU多卡训练:由于静态图多卡训练方式与非结构化稀疏中的mask逻辑存在兼容性问题,会在一定程度上影响训练精度,我们建议使用[Fleet](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/fleet_api_howto_cn.html)方式启动稀疏化多卡训练,实测精度与单卡一致。同时,为帮助开发者将`with_data_parallel`方式配置的分布式代码转换为`Fleet`我们在[示例代码](./train.py)里面也用`"Fleet step"`清晰标注出了用代码需要做的更改 GPU多卡训练:由于静态图多卡训练方式与非结构化稀疏中的mask逻辑存在兼容性问题,会在一定程度上影响训练精度,我们建议使用[Fleet](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/fleet_api_howto_cn.html)方式启动稀疏化多卡训练,[示例代码](./train.py)里面用`"Fleet step"`清晰标注出了设置流程
```bash ```bash
python -m paddle.distributed.launch \ python -m paddle.distributed.launch \
--selected_gpus="0,1,2,3" \ --selected_gpus="0,1,2,3" \
...@@ -135,7 +135,7 @@ for epoch in range(epochs): ...@@ -135,7 +135,7 @@ for epoch in range(epochs):
loss_n, acc_top1_n, acc_top5_n = exe.run( loss_n, acc_top1_n, acc_top5_n = exe.run(
train_program, train_program,
feed=data, feed=data,
fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name]) fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
learning_rate.step() learning_rate.step()
#STEP2: update the pruner's threshold given the updated parameters #STEP2: update the pruner's threshold given the updated parameters
pruner.step() pruner.step()
......
...@@ -453,13 +453,6 @@ convert ...@@ -453,13 +453,6 @@ convert
#调用api #调用api
quant_train_program = quant.quant_aware(train_program, place, config, for_test=False) quant_train_program = quant.quant_aware(train_program, place, config, for_test=False)
quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True) quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True)
#关闭策略
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
quant_train_program = quant_train_program.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
inference_prog = quant.convert(quant_eval_program, place, config) inference_prog = quant.convert(quant_eval_program, place, config)
......
...@@ -23,7 +23,7 @@ from tqdm import tqdm ...@@ -23,7 +23,7 @@ from tqdm import tqdm
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.io import DataLoader from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression from paddleslim.auto_compression import AutoCompression
...@@ -101,9 +101,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -101,9 +101,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1: if len(test_feed_names) == 1:
image = np.array(image) image = np.array(image)
label = np.array(label).astype('int64') label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program, pred = exe.run(
feed={test_feed_names[0]: image}, compiled_test_program,
fetch_list=test_fetch_list) feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0]) pred = np.array(pred[0])
label = np.array(label) label = np.array(label)
sort_array = pred.argsort(axis=1) sort_array = pred.argsort(axis=1)
...@@ -120,12 +121,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -120,12 +121,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
# eval "eval model", which inputs are image and label, output is top1 and top5 accuracy # eval "eval model", which inputs are image and label, output is top1 and top5 accuracy
image = np.array(image) image = np.array(image)
label = np.array(label).astype('int64') label = np.array(label).astype('int64')
result = exe.run(compiled_test_program, result = exe.run(
feed={ compiled_test_program,
test_feed_names[0]: image, feed={test_feed_names[0]: image,
test_feed_names[1]: label test_feed_names[1]: label},
}, fetch_list=test_fetch_list)
fetch_list=test_fetch_list)
result = [np.mean(r) for r in result] result = [np.mean(r) for r in result]
results.append(result) results.append(result)
t.update() t.update()
...@@ -148,13 +148,13 @@ def main(): ...@@ -148,13 +148,13 @@ def main():
global_config = all_config["Global"] global_config = all_config["Global"]
gpu_num = paddle.distributed.get_world_size() gpu_num = paddle.distributed.get_world_size()
if isinstance(all_config['TrainConfig']['learning_rate'], if isinstance(
dict) and all_config['TrainConfig']['learning_rate'][ all_config['TrainConfig']['learning_rate'], dict
'type'] == 'CosineAnnealingDecay': ) and all_config['TrainConfig']['learning_rate']['type'] == 'CosineAnnealingDecay':
step = int( step = int(
math.ceil( math.ceil(
float(args.total_images) / (global_config['batch_size'] * float(args.total_images) / (
gpu_num))) global_config['batch_size'] * gpu_num)))
all_config['TrainConfig']['learning_rate']['T_max'] = step all_config['TrainConfig']['learning_rate']['T_max'] = step
print('total training steps:', step) print('total training steps:', step)
...@@ -171,13 +171,15 @@ def main(): ...@@ -171,13 +171,15 @@ def main():
data_dir=data_dir, data_dir=data_dir,
crop_size=img_size, crop_size=img_size,
resize_size=resize_size) resize_size=resize_size)
batch_sampler = DistributedBatchSampler(
train_loader = DataLoader(
train_dataset, train_dataset,
places=[place],
batch_size=global_config['batch_size'], batch_size=global_config['batch_size'],
shuffle=True, shuffle=True,
drop_last=True, drop_last=True)
train_loader = DataLoader(
train_dataset,
places=[place],
batch_sampler=batch_sampler,
num_workers=0) num_workers=0)
global_config['input_name'] = get_feed_vars( global_config['input_name'] = get_feed_vars(
global_config['model_dir'], global_config['model_filename'], global_config['model_dir'], global_config['model_filename'],
...@@ -198,8 +200,7 @@ def main(): ...@@ -198,8 +200,7 @@ def main():
global_config['batch_size'], global_config['batch_size'],
crop_size=img_size, crop_size=img_size,
resize_size=resize_size, resize_size=resize_size,
place=place), place=place), global_config['input_name']))
global_config['input_name']))
ac.compress() ac.compress()
......
...@@ -174,11 +174,11 @@ def reader(): ...@@ -174,11 +174,11 @@ def reader():
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type
): fn(samples) ): fn(samples)
train_batch_sampler = paddle.io.BatchSampler( train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=global_config['batch_size'], shuffle=True) train_ds, batch_size=global_config['batch_size'], shuffle=True)
[input_ids, token_type_ids, labels] = create_data_holder(global_config[ [input_ids, token_type_ids,
'task_name']) labels] = create_data_holder(global_config['task_name'])
feed_list_name = [] feed_list_name = []
train_data_loader = DataLoader( train_data_loader = DataLoader(
dataset=train_ds, dataset=train_ds,
...@@ -215,12 +215,13 @@ def reader(): ...@@ -215,12 +215,13 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset() metric.reset()
for data in eval_dataloader(): for data in eval_dataloader():
logits = exe.run(compiled_test_program, logits = exe.run(
feed={ compiled_test_program,
test_feed_names[0]: data[0]['input_ids'], feed={
test_feed_names[1]: data[0]['token_type_ids'] test_feed_names[0]: data[0]['input_ids'],
}, test_feed_names[1]: data[0]['token_type_ids']
fetch_list=test_fetch_list) },
fetch_list=test_fetch_list)
paddle.disable_static() paddle.disable_static()
labels_pd = paddle.to_tensor(np.array(data[0]['label']).flatten()) labels_pd = paddle.to_tensor(np.array(data[0]['label']).flatten())
logits_pd = paddle.to_tensor(logits[0]) logits_pd = paddle.to_tensor(logits[0])
...@@ -244,12 +245,13 @@ def eval(): ...@@ -244,12 +245,13 @@ def eval():
metric.reset() metric.reset()
print('Evaluating...') print('Evaluating...')
for data in eval_dataloader(): for data in eval_dataloader():
logits = exe.run(val_program, logits = exe.run(
feed={ val_program,
feed_target_names[0]: data[0]['input_ids'], feed={
feed_target_names[1]: data[0]['token_type_ids'] feed_target_names[0]: data[0]['input_ids'],
}, feed_target_names[1]: data[0]['token_type_ids']
fetch_list=fetch_targets) },
fetch_list=fetch_targets)
paddle.disable_static() paddle.disable_static()
labels_pd = paddle.to_tensor(np.array(data[0]['label']).flatten()) labels_pd = paddle.to_tensor(np.array(data[0]['label']).flatten())
logits_pd = paddle.to_tensor(logits[0]) logits_pd = paddle.to_tensor(logits[0])
......
...@@ -181,8 +181,7 @@ def reader_proprecess(data_path, max_seq_len=512): ...@@ -181,8 +181,7 @@ def reader_proprecess(data_path, max_seq_len=512):
cur_result_list = [] cur_result_list = []
for result in result_list: for result in result_list:
if result['start'] + 1 <= max_content_len < result[ if result['start'] + 1 <= max_content_len < result['end']:
'end']:
max_content_len = result['start'] max_content_len = result['start']
break break
...@@ -276,7 +275,7 @@ def reader(): ...@@ -276,7 +275,7 @@ def reader():
[input_ids, token_type_ids, start_ids, end_ids] = create_data_holder() [input_ids, token_type_ids, start_ids, end_ids] = create_data_holder()
train_batch_sampler = paddle.io.BatchSampler( train_batch_sampler = paddle.io.DistributedBatchSampler(
dataset=train_ds, batch_size=global_config['batch_size'], shuffle=True) dataset=train_ds, batch_size=global_config['batch_size'], shuffle=True)
train_dataloader = paddle.io.DataLoader( train_dataloader = paddle.io.DataLoader(
train_ds, train_ds,
...@@ -299,12 +298,13 @@ def reader(): ...@@ -299,12 +298,13 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset() metric.reset()
for data in eval_dataloader(): for data in eval_dataloader():
logits = exe.run(compiled_test_program, logits = exe.run(
feed={ compiled_test_program,
'input_ids': data[0]['input_ids'], feed={
'token_type_ids': data[0]['token_type_ids'], 'input_ids': data[0]['input_ids'],
}, 'token_type_ids': data[0]['token_type_ids'],
fetch_list=test_fetch_list) },
fetch_list=test_fetch_list)
paddle.disable_static() paddle.disable_static()
start_ids = paddle.to_tensor(np.array(data[0]['start_ids'])) start_ids = paddle.to_tensor(np.array(data[0]['start_ids']))
...@@ -313,8 +313,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -313,8 +313,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
start_prob = paddle.to_tensor(logits[0]) start_prob = paddle.to_tensor(logits[0])
end_prob = paddle.to_tensor(logits[1]) end_prob = paddle.to_tensor(logits[1])
num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, num_correct, num_infer, num_label = metric.compute(
start_ids, end_ids) start_prob, end_prob, start_ids, end_ids)
metric.update(num_correct, num_infer, num_label) metric.update(num_correct, num_infer, num_label)
paddle.enable_static() paddle.enable_static()
precision, recall, f1 = metric.accumulate() precision, recall, f1 = metric.accumulate()
......
...@@ -120,17 +120,14 @@ def create_data_holder(task_name, input_names): ...@@ -120,17 +120,14 @@ def create_data_holder(task_name, input_names):
inputs = [] inputs = []
for name in input_names: for name in input_names:
inputs.append( inputs.append(
paddle.static.data( paddle.static.data(name=name, shape=[-1, -1], dtype="int64"))
name=name, shape=[-1, -1], dtype="int64"))
if task_name == "sts-b": if task_name == "sts-b":
inputs.append( inputs.append(
paddle.static.data( paddle.static.data(name="label", shape=[-1, 1], dtype="float32"))
name="label", shape=[-1, 1], dtype="float32"))
else: else:
inputs.append( inputs.append(
paddle.static.data( paddle.static.data(name="label", shape=[-1, 1], dtype="int64"))
name="label", shape=[-1, 1], dtype="int64"))
return inputs return inputs
...@@ -164,7 +161,7 @@ def reader(): ...@@ -164,7 +161,7 @@ def reader():
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type
): fn(samples) ): fn(samples)
train_batch_sampler = paddle.io.BatchSampler( train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, train_ds,
batch_size=global_config['batch_size'], batch_size=global_config['batch_size'],
shuffle=True, shuffle=True,
...@@ -257,13 +254,14 @@ def reader(): ...@@ -257,13 +254,14 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset() metric.reset()
for data in eval_dataloader(): for data in eval_dataloader():
logits = exe.run(compiled_test_program, logits = exe.run(
feed={ compiled_test_program,
test_feed_names[0]: data[0]['x0'], feed={
test_feed_names[1]: data[0]['x1'], test_feed_names[0]: data[0]['x0'],
test_feed_names[2]: data[0]['x2'] test_feed_names[1]: data[0]['x1'],
}, test_feed_names[2]: data[0]['x2']
fetch_list=test_fetch_list) },
fetch_list=test_fetch_list)
paddle.disable_static() paddle.disable_static()
if isinstance(metric, PearsonAndSpearman): if isinstance(metric, PearsonAndSpearman):
labels_pd = paddle.to_tensor(np.array(data[0]['label'])).reshape( labels_pd = paddle.to_tensor(np.array(data[0]['label'])).reshape(
...@@ -293,13 +291,14 @@ def eval(): ...@@ -293,13 +291,14 @@ def eval():
metric.reset() metric.reset()
print('Evaluating...') print('Evaluating...')
for data in eval_dataloader(): for data in eval_dataloader():
logits = exe.run(val_program, logits = exe.run(
feed={ val_program,
feed_target_names[0]: data[0]['x0'], feed={
feed_target_names[1]: data[0]['x1'], feed_target_names[0]: data[0]['x0'],
feed_target_names[2]: data[0]['x2'] feed_target_names[1]: data[0]['x1'],
}, feed_target_names[2]: data[0]['x2']
fetch_list=fetch_targets) },
fetch_list=fetch_targets)
paddle.disable_static() paddle.disable_static()
if isinstance(metric, PearsonAndSpearman): if isinstance(metric, PearsonAndSpearman):
labels_pd = paddle.to_tensor(np.array(data[0]['label'])).reshape( labels_pd = paddle.to_tensor(np.array(data[0]['label'])).reshape(
......
...@@ -67,10 +67,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -67,10 +67,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
ncols=80) as t: ncols=80) as t:
for data in val_loader: for data in val_loader:
data_all = {k: np.array(v) for k, v in data.items()} data_all = {k: np.array(v) for k, v in data.items()}
outs = exe.run(compiled_test_program, outs = exe.run(
feed={test_feed_names[0]: data_all['image']}, compiled_test_program,
fetch_list=test_fetch_list, feed={test_feed_names[0]: data_all['image']},
return_numpy=False) fetch_list=test_fetch_list,
return_numpy=False)
res = postprocess(np.array(outs[0]), data_all['scale_factor']) res = postprocess(np.array(outs[0]), data_all['scale_factor'])
bboxes_list.append(res['bbox']) bboxes_list.append(res['bbox'])
bbox_nums_list.append(res['bbox_num']) bbox_nums_list.append(res['bbox_num'])
...@@ -93,12 +94,10 @@ def main(): ...@@ -93,12 +94,10 @@ def main():
paddle.vision.image.set_image_backend('cv2') paddle.vision.image.set_image_backend('cv2')
train_dataset = paddle.vision.datasets.ImageFolder( train_dataset = paddle.vision.datasets.ImageFolder(
global_config['image_path'], transform=yolo_image_preprocess) global_config['image_path'], transform=yolo_image_preprocess)
batch_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=1, shuffle=True, drop_last=True)
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
train_dataset, train_dataset, batch_sampler=batch_sampler, num_workers=0)
batch_size=1,
shuffle=True,
drop_last=True,
num_workers=0)
train_loader = reader_wrapper(train_loader, input_name=input_name) train_loader = reader_wrapper(train_loader, input_name=input_name)
eval_func = None eval_func = None
else: else:
...@@ -107,8 +106,10 @@ def main(): ...@@ -107,8 +106,10 @@ def main():
image_dir=global_config['coco_train_image_dir'], image_dir=global_config['coco_train_image_dir'],
anno_path=global_config['coco_train_anno_path'], anno_path=global_config['coco_train_anno_path'],
input_name=input_name) input_name=input_name)
batch_sampler = paddle.io.DistributedBatchSampler(
dataset, batch_size=1, shuffle=True, drop_last=True)
train_loader = paddle.io.DataLoader( train_loader = paddle.io.DataLoader(
dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0) dataset, batch_size=1, num_workers=0, batch_sampler=batch_sampler)
if paddle.distributed.get_rank() == 0: if paddle.distributed.get_rank() == 0:
eval_func = eval_function eval_func = eval_function
global val_loader global val_loader
......
...@@ -21,7 +21,7 @@ from functools import partial ...@@ -21,7 +21,7 @@ from functools import partial
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.io import DataLoader from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression from paddleslim.auto_compression import AutoCompression
...@@ -72,9 +72,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -72,9 +72,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1: if len(test_feed_names) == 1:
image = np.array(image) image = np.array(image)
label = np.array(label).astype('int64') label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program, pred = exe.run(
feed={test_feed_names[0]: image}, compiled_test_program,
fetch_list=test_fetch_list) feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0]) pred = np.array(pred[0])
label = np.array(label) label = np.array(label)
sort_array = pred.argsort(axis=1) sort_array = pred.argsort(axis=1)
...@@ -114,13 +115,13 @@ def main(): ...@@ -114,13 +115,13 @@ def main():
data_dir = global_config['data_dir'] data_dir = global_config['data_dir']
train_dataset = ImageNetDataset(mode='train', data_dir=data_dir) train_dataset = ImageNetDataset(mode='train', data_dir=data_dir)
batch_sampler = DistributedBatchSampler(
train_loader = DataLoader(
train_dataset, train_dataset,
batch_size=global_config['batch_size'], batch_size=global_config['batch_size'],
shuffle=True, shuffle=True,
drop_last=True, drop_last=True)
num_workers=0) train_loader = DataLoader(
train_dataset, batch_sampler=batch_sampler, num_workers=0)
train_dataloader = reader_wrapper(train_loader, global_config['input_name']) train_dataloader = reader_wrapper(train_loader, global_config['input_name'])
ac = AutoCompression( ac = AutoCompression(
......
...@@ -23,7 +23,7 @@ from tqdm import tqdm ...@@ -23,7 +23,7 @@ from tqdm import tqdm
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.io import DataLoader from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression from paddleslim.auto_compression import AutoCompression
...@@ -97,9 +97,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -97,9 +97,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1: if len(test_feed_names) == 1:
image = np.array(image) image = np.array(image)
label = np.array(label).astype('int64') label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program, pred = exe.run(
feed={test_feed_names[0]: image}, compiled_test_program,
fetch_list=test_fetch_list) feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0]) pred = np.array(pred[0])
label = np.array(label) label = np.array(label)
sort_array = pred.argsort(axis=1) sort_array = pred.argsort(axis=1)
...@@ -116,12 +117,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list): ...@@ -116,12 +117,11 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
# eval "eval model", which inputs are image and label, output is top1 and top5 accuracy # eval "eval model", which inputs are image and label, output is top1 and top5 accuracy
image = np.array(image) image = np.array(image)
label = np.array(label).astype('int64') label = np.array(label).astype('int64')
result = exe.run(compiled_test_program, result = exe.run(
feed={ compiled_test_program,
test_feed_names[0]: image, feed={test_feed_names[0]: image,
test_feed_names[1]: label test_feed_names[1]: label},
}, fetch_list=test_fetch_list)
fetch_list=test_fetch_list)
result = [np.mean(r) for r in result] result = [np.mean(r) for r in result]
results.append(result) results.append(result)
t.update() t.update()
...@@ -144,13 +144,13 @@ def main(): ...@@ -144,13 +144,13 @@ def main():
global_config = all_config["Global"] global_config = all_config["Global"]
gpu_num = paddle.distributed.get_world_size() gpu_num = paddle.distributed.get_world_size()
if isinstance(all_config['TrainConfig']['learning_rate'], if isinstance(
dict) and all_config['TrainConfig']['learning_rate'][ all_config['TrainConfig']['learning_rate'], dict
'type'] == 'CosineAnnealingDecay': ) and all_config['TrainConfig']['learning_rate']['type'] == 'CosineAnnealingDecay':
step = int( step = int(
math.ceil( math.ceil(
float(args.total_images) / (global_config['batch_size'] * float(args.total_images) / (
gpu_num))) global_config['batch_size'] * gpu_num)))
all_config['TrainConfig']['learning_rate']['T_max'] = step all_config['TrainConfig']['learning_rate']['T_max'] = step
print('total training steps:', step) print('total training steps:', step)
...@@ -167,13 +167,15 @@ def main(): ...@@ -167,13 +167,15 @@ def main():
data_dir=data_dir, data_dir=data_dir,
crop_size=img_size, crop_size=img_size,
resize_size=resize_size) resize_size=resize_size)
batch_sampler = DistributedBatchSampler(
train_loader = DataLoader(
train_dataset, train_dataset,
places=[place],
batch_size=global_config['batch_size'], batch_size=global_config['batch_size'],
shuffle=True, shuffle=True,
drop_last=True, drop_last=True)
train_loader = DataLoader(
train_dataset,
places=[place],
batch_sampler=batch_sampler,
num_workers=0) num_workers=0)
train_dataloader = reader_wrapper(train_loader, global_config['input_name']) train_dataloader = reader_wrapper(train_loader, global_config['input_name'])
...@@ -191,8 +193,7 @@ def main(): ...@@ -191,8 +193,7 @@ def main():
global_config['batch_size'], global_config['batch_size'],
crop_size=img_size, crop_size=img_size,
resize_size=resize_size, resize_size=resize_size,
place=place), place=place), global_config['input_name']))
global_config['input_name']))
ac.compress() ac.compress()
......
...@@ -157,10 +157,10 @@ class AutoCompression: ...@@ -157,10 +157,10 @@ class AutoCompression:
self.deploy_hardware = deploy_hardware self.deploy_hardware = deploy_hardware
paddle.enable_static() paddle.enable_static()
self._exe, self._places = self._prepare_envs() self._exe, self._places, self.fleet = self._prepare_envs()
self.default_distill_node_pair, self.model_type = self._get_model_info() self.default_distill_node_pair, self.model_type = self._get_model_info()
if self.train_config is not None and self.train_config.use_fleet: if self.fleet:
fleet.init(is_collective=True) fleet.init(is_collective=True)
if with_variable_shape( if with_variable_shape(
...@@ -306,7 +306,8 @@ class AutoCompression: ...@@ -306,7 +306,8 @@ class AutoCompression:
places = paddle.device._convert_to_place(devices) places = paddle.device._convert_to_place(devices)
_logger.info(f"devices: {devices}") _logger.info(f"devices: {devices}")
exe = paddle.static.Executor(places) exe = paddle.static.Executor(places)
return exe, places fleet = paddle.device.cuda.device_count() > 1
return exe, places, fleet
def _get_model_info(self): def _get_model_info(self):
[inference_program, _, _] = (load_inference_model( [inference_program, _, _] = (load_inference_model(
...@@ -443,9 +444,8 @@ class AutoCompression: ...@@ -443,9 +444,8 @@ class AutoCompression:
return strategy, config return strategy, config
def _prepare_fleet_strategy(train_config): def _prepare_fleet_strategy(self, train_config):
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
strategy = fleet.DistributedStrategy() strategy = fleet.DistributedStrategy()
strategy.build_strategy = build_strategy strategy.build_strategy = build_strategy
...@@ -458,8 +458,6 @@ class AutoCompression: ...@@ -458,8 +458,6 @@ class AutoCompression:
if train_config.amp_config is not None: if train_config.amp_config is not None:
strategy.amp = True strategy.amp = True
strategy.amp_configs = {**train_config.amp_config} strategy.amp_configs = {**train_config.amp_config}
if train_config.asp_config is not None:
strategy.asp = True
return strategy return strategy
def _prepare_program(self, program, feed_target_names, fetch_targets, def _prepare_program(self, program, feed_target_names, fetch_targets,
...@@ -498,7 +496,7 @@ class AutoCompression: ...@@ -498,7 +496,7 @@ class AutoCompression:
self._exe, self._places, config_dict, train_program_info, self._exe, self._places, config_dict, train_program_info,
strategy, patterns, self.eval_dataloader) strategy, patterns, self.eval_dataloader)
if train_config.use_fleet: if self.fleet:
dist_strategy = self._prepare_fleet_strategy(train_config) dist_strategy = self._prepare_fleet_strategy(train_config)
else: else:
dist_strategy = None dist_strategy = None
...@@ -534,7 +532,7 @@ class AutoCompression: ...@@ -534,7 +532,7 @@ class AutoCompression:
self._exe.run(train_program_info.startup_program) self._exe.run(train_program_info.startup_program)
if (not train_config.use_fleet) and train_config.amp_config is not None: if (not self.fleet) and train_config.amp_config is not None:
if hasattr( if hasattr(
train_config.amp_config, train_config.amp_config,
'use_pure_fp16') and train_config.amp_config.use_pure_fp16: 'use_pure_fp16') and train_config.amp_config.use_pure_fp16:
...@@ -545,7 +543,7 @@ class AutoCompression: ...@@ -545,7 +543,7 @@ class AutoCompression:
### prune weight in scope ### prune weight in scope
self._pruner.prune_model(train_program_info.program) self._pruner.prune_model(train_program_info.program)
if not train_config.use_fleet: if not self.fleet:
train_program_info = self._compiled_program(train_program_info, train_program_info = self._compiled_program(train_program_info,
strategy) strategy)
test_program_info = self._compiled_program(test_program_info, test_program_info = self._compiled_program(test_program_info,
...@@ -553,19 +551,16 @@ class AutoCompression: ...@@ -553,19 +551,16 @@ class AutoCompression:
return train_program_info, test_program_info return train_program_info, test_program_info
def _compiled_program(self, program_info, strategy): def _compiled_program(self, program_info, strategy):
compiled_prog = paddle.static.CompiledProgram(program_info.program)
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
if 'qat' in strategy: if 'qat' in strategy:
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
build_strategy.enable_inplace = False build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False build_strategy.sync_batch_norm = False
compiled_prog = compiled_prog.with_data_parallel( compiled_prog = paddle.static.CompiledProgram(
loss_name=program_info.fetch_targets[0].name, program_info.program, build_strategy=build_strategy)
build_strategy=build_strategy,
exec_strategy=exec_strategy)
program_info.program = compiled_prog program_info.program = compiled_prog
return program_info return program_info
...@@ -823,8 +818,12 @@ class AutoCompression: ...@@ -823,8 +818,12 @@ class AutoCompression:
test_program_info.fetch_targets) test_program_info.fetch_targets)
if metric > best_metric: if metric > best_metric:
tmp_program = test_program_info.program._program if isinstance(
test_program_info.program,
paddle.static.CompiledProgram
) else test_program_info.program
paddle.static.save( paddle.static.save(
program=test_program_info.program._program, program=tmp_program,
model_path=os.path.join(self.tmp_dir, model_path=os.path.join(self.tmp_dir,
'best_model')) 'best_model'))
best_metric = metric best_metric = metric
......
...@@ -346,7 +346,7 @@ def build_distill_program(executor, ...@@ -346,7 +346,7 @@ def build_distill_program(executor,
with paddle.utils.unique_name.guard('merge'): with paddle.utils.unique_name.guard('merge'):
optimizer, learning_rate = _create_optimizer(train_config) optimizer, learning_rate = _create_optimizer(train_config)
if train_config.get('use_fleet'): if dist_strategy is not None:
optimizer = fleet.distributed_optimizer(optimizer, optimizer = fleet.distributed_optimizer(optimizer,
dist_strategy) dist_strategy)
else: else:
...@@ -385,8 +385,7 @@ def build_distill_program(executor, ...@@ -385,8 +385,7 @@ def build_distill_program(executor,
loss.stop_gradient = False loss.stop_gradient = False
if 'prune_params_name' in config: ### prune if 'prune_params_name' in config: ### prune
if 'pruned_ratio' not in config and not train_config.get( if 'pruned_ratio' not in config and dist_strategy is None: ### asp
'use_fleet'): ### asp
optimizer = pruner.decorate(optimizer) optimizer = pruner.decorate(optimizer)
optimizer.minimize(loss) optimizer.minimize(loss)
elif 'prune_strategy' in config: ###unstructure prune elif 'prune_strategy' in config: ###unstructure prune
......
...@@ -339,7 +339,6 @@ class TrainConfig: ...@@ -339,7 +339,6 @@ class TrainConfig:
logging_iter=10, logging_iter=10,
origin_metric=None, origin_metric=None,
target_metric=None, target_metric=None,
use_fleet=False,
amp_config=None, amp_config=None,
recompute_config=None, recompute_config=None,
sharding_config=None, sharding_config=None,
...@@ -372,8 +371,7 @@ class TrainConfig: ...@@ -372,8 +371,7 @@ class TrainConfig:
logging_iter(int): Log period in batches. Default: 10. logging_iter(int): Log period in batches. Default: 10.
origin_metric(float, optional): The metric of the model before compression; if it is not None, it is used to check whether the dataloader is correct. Default: None. origin_metric(float, optional): The metric of the model before compression; if it is not None, it is used to check whether the dataloader is correct. Default: None.
target_metric(float, optional): The metric of the model after compression. If a target metric is set, training stops once the metric of the compressed model satisfies it. If it is not set, training runs for the number of epochs the user configured. Default: None. target_metric(float, optional): The metric of the model after compression. If a target metric is set, training stops once the metric of the compressed model satisfies it. If it is not set, training runs for the number of epochs the user configured. Default: None.
use_fleet(bool): Whether to use fleet. Default: False. amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description is as below when distributed training is not turned on:
amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description is as below if use_fleet=False:
.. code-block:: python .. code-block:: python
AMP-O1 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id2>`_ : AMP-O1 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id2>`_ :
{'custom_white_list', set} # The custom white_list. It's the set of ops that support {'custom_white_list', set} # The custom white_list. It's the set of ops that support
...@@ -389,10 +387,10 @@ class TrainConfig: ...@@ -389,10 +387,10 @@ class TrainConfig:
{'use_fp16_guard': bool} # Whether to use `fp16_guard` when constructing the program. {'use_fp16_guard': bool} # Whether to use `fp16_guard` when constructing the program.
.. ..
If you want to use AMP-O2, you need to set use_pure_fp16 to True and use_fp16_guard to False. If you want to use AMP-O2, you need to set use_pure_fp16 to True and use_fp16_guard to False.
If use_fleet=True, the key of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_. When turning on distributed training, the key of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_.
recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config only can be set when use_fleet=True, the key of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_. recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config only can be set when turning on distributed training, the key of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_.
sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config only can be set when use_fleet=True, the key of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_. sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config only can be set when turning on distributed training, the key of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_.
sparse_model(bool, optional): Set sparse_model to ``True`` to remove mask tensor when the compress strategy is unstructure prune. Default: False. sparse_model(bool, optional): Set sparse_model to ``True`` to remove mask tensor when the compress strategy is unstructure prune. Default: False.
""" """
self.epochs = epochs self.epochs = epochs
...@@ -403,7 +401,6 @@ class TrainConfig: ...@@ -403,7 +401,6 @@ class TrainConfig:
self.logging_iter = logging_iter self.logging_iter = logging_iter
self.origin_metric = origin_metric self.origin_metric = origin_metric
self.target_metric = target_metric self.target_metric = target_metric
self.use_fleet = use_fleet
self.amp_config = amp_config self.amp_config = amp_config
self.recompute_config = recompute_config self.recompute_config = recompute_config
self.sharding_config = sharding_config self.sharding_config = sharding_config
......
...@@ -103,13 +103,11 @@ class VarCollector(object): ...@@ -103,13 +103,11 @@ class VarCollector(object):
def run(self, reader, exe, step=None, loss_name=None): def run(self, reader, exe, step=None, loss_name=None):
if not hasattr(self.program, '_program'): if not hasattr(self.program, '_program'):
# Compile the native program to speed up # Compile the native program to speed up
program = paddle.static.CompiledProgram( program = paddle.static.CompiledProgram(self.program)
self.program).with_data_parallel(loss_name=loss_name)
for idx, data in enumerate(reader): for idx, data in enumerate(reader):
vars_np = exe.run(program=program, vars_np = exe.run(
feed=data, program=program, feed=data, fetch_list=self.real_names)
fetch_list=self.real_names)
mapped_vars_np = dict(zip(self.real_names, vars_np)) mapped_vars_np = dict(zip(self.real_names, vars_np))
values = self.update(mapped_vars_np) values = self.update(mapped_vars_np)
...@@ -129,8 +127,7 @@ class VarCollector(object): ...@@ -129,8 +127,7 @@ class VarCollector(object):
if not hasattr(self.program, '_program'): if not hasattr(self.program, '_program'):
# Compile the native program to speed up # Compile the native program to speed up
program = paddle.static.CompiledProgram( program = paddle.static.CompiledProgram(self.program)
self.program).with_data_parallel(loss_name=loss_name)
for idx, data in enumerate(reader): for idx, data in enumerate(reader):
vars_np = exe.run(program=program, feed=data, fetch_list=fetch_list) vars_np = exe.run(program=program, feed=data, fetch_list=fetch_list)
vars_np = [np.max(var) for var in vars_np] vars_np = [np.max(var) for var in vars_np]
...@@ -172,5 +169,5 @@ class VarCollector(object): ...@@ -172,5 +169,5 @@ class VarCollector(object):
plt.show() plt.show()
pdf.savefig() pdf.savefig()
plt.close() plt.close()
_logger.info("variables histogram have been saved as {}".format( _logger.info(
pdf_path)) "variables histogram have been saved as {}".format(pdf_path))
...@@ -107,17 +107,14 @@ def _parse_train_configs(train_config): ...@@ -107,17 +107,14 @@ def _parse_train_configs(train_config):
def _compile_program(program, fetch_var_name): def _compile_program(program, fetch_var_name):
"""compiling program""" """compiling program"""
compiled_prog = paddle.static.CompiledProgram(program)
build_strategy = paddle.static.BuildStrategy() build_strategy = paddle.static.BuildStrategy()
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
build_strategy.enable_inplace = False build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy() compiled_prog = paddle.static.CompiledProgram(
compiled_prog = compiled_prog.with_data_parallel( program, build_strategy=build_strategy)
loss_name=fetch_var_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
return compiled_prog return compiled_prog
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册