Unverified commit e33dc481, authored by zhouzj, committed by GitHub

remove with_data_parallel (#1658)

* remove with_data_parallel

* ACT adapts fleet

* ACT's demo adapts fleet

* fix bugs
Parent 65c776de
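At its core, the commit swaps the removed `with_data_parallel` call for the `CompiledProgram` constructor and moves multi-card training onto `paddle.distributed.fleet`. A minimal before/after sketch of the recurring pattern (the toy program is illustrative, not from the repo):

```python
import paddle

paddle.enable_static()

# Toy static program standing in for the demos' train_program.
train_program = paddle.static.Program()
startup_program = paddle.static.Program()
with paddle.static.program_guard(train_program, startup_program):
    x = paddle.static.data(name='x', shape=[None, 8], dtype='float32')
    avg_cost = paddle.mean(paddle.static.nn.fc(x, size=1))

build_strategy = paddle.static.BuildStrategy()

# Before (API removed from Paddle):
# train_compiled_program = paddle.static.CompiledProgram(
#     train_program).with_data_parallel(
#         loss_name=avg_cost.name, build_strategy=build_strategy)

# After: the build strategy goes straight into the constructor; data
# parallelism is handled by paddle.distributed.fleet instead.
train_compiled_program = paddle.static.CompiledProgram(
    train_program, build_strategy=build_strategy)
```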
......@@ -13,7 +13,6 @@ import paddle.static as static
from paddleslim.analysis import flops
from paddleslim.nas import SANAS
from paddleslim.common import get_logger
from optimizer import create_optimizer
import imagenet_reader
_logger = get_logger(__name__, level=logging.INFO)
......@@ -157,15 +156,13 @@ def search_mobilenetv2_block(config, args, image_size):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
outs = exe.run(train_compiled_program,
feed=data,
fetch_list=fetches)[0]
outs = exe.run(
train_compiled_program, feed=data, fetch_list=fetches)[0]
batch_time = time.time() - s_time
if batch_id % 10 == 0:
_logger.info(
......@@ -175,9 +172,8 @@ def search_mobilenetv2_block(config, args, image_size):
reward = []
for batch_id, data in enumerate(val_loader()):
test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
batch_reward = exe.run(test_program,
feed=data,
fetch_list=test_fetches)
batch_reward = exe.run(
test_program, feed=data, fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg)
......
......@@ -141,15 +141,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
outs = exe.run(train_compiled_program,
feed=data,
fetch_list=fetches)[0]
outs = exe.run(
train_compiled_program, feed=data, fetch_list=fetches)[0]
batch_time = time.time() - s_time
if batch_id % 10 == 0:
_logger.info(
......@@ -161,9 +159,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
]
batch_reward = exe.run(test_program,
feed=data,
fetch_list=test_fetches)
batch_reward = exe.run(
test_program, feed=data, fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg)
......
......@@ -134,15 +134,13 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
outs = exe.run(train_compiled_program,
feed=data,
fetch_list=fetches)[0]
outs = exe.run(
train_compiled_program, feed=data, fetch_list=fetches)[0]
batch_time = time.time() - s_time
if batch_id % 10 == 0:
_logger.info(
......@@ -154,9 +152,8 @@ def search_mobilenetv2(config, args, image_size, is_server=True):
test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
]
batch_reward = exe.run(test_program,
feed=data,
fetch_list=test_fetches)
batch_reward = exe.run(
test_program, feed=data, fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg)
......@@ -223,15 +220,13 @@ def test_search_result(tokens, image_size, args, config):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=avg_cost.name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
for epoch_id in range(args.retain_epoch):
for batch_id, data in enumerate(train_loader()):
fetches = [avg_cost.name]
s_time = time.time()
outs = exe.run(train_compiled_program,
feed=data,
fetch_list=fetches)[0]
outs = exe.run(
train_compiled_program, feed=data, fetch_list=fetches)[0]
batch_time = time.time() - s_time
if batch_id % 10 == 0:
_logger.info(
......@@ -243,9 +238,8 @@ def test_search_result(tokens, image_size, args, config):
test_fetches = [
test_avg_cost.name, test_acc_top1.name, test_acc_top5.name
]
batch_reward = exe.run(test_program,
feed=data,
fetch_list=test_fetches)
batch_reward = exe.run(
test_program, feed=data, fetch_list=test_fetches)
reward_avg = np.mean(np.array(batch_reward), axis=1)
reward.append(reward_avg)
......
......@@ -119,8 +119,8 @@ def train(main_prog, exe, epoch_id, train_loader, fetch_list, args):
[[drop_path_probility * epoch_id / args.retain_epoch]
for i in range(args.batch_size)]).astype(np.float32)
drop_path_mask = 1 - np.random.binomial(
1, drop_path_prob[0],
size=[args.batch_size, 20, 4, 2]).astype(np.float32)
1, drop_path_prob[0], size=[args.batch_size, 20, 4, 2
]).astype(np.float32)
feed.append({
"image": image,
"label": label,
......@@ -195,8 +195,8 @@ def search(config, args, image_size, is_server=True):
current_params = count_parameters_in_MB(
train_program.global_block().all_parameters(), 'cifar10')
_logger.info('step: {}, current_params: {}M'.format(step,
current_params))
_logger.info(
'step: {}, current_params: {}M'.format(step, current_params))
if current_params > float(3.77):
continue
......@@ -222,9 +222,7 @@ def search(config, args, image_size, is_server=True):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=train_fetch_list[0].name,
build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
valid_top1_list = []
for epoch_id in range(args.retain_epoch):
......@@ -234,8 +232,8 @@ def search(config, args, image_size, is_server=True):
step, epoch_id, train_top1))
valid_top1 = valid(test_program, exe, epoch_id, test_loader,
test_fetch_list, args)
_logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
valid_top1))
_logger.info(
"TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
valid_top1_list.append(valid_top1)
sa_nas.reward(float(valid_top1_list[-1] + valid_top1_list[-2]) / 2)
......@@ -276,19 +274,18 @@ def final_test(config, args, image_size, token=None):
build_strategy = static.BuildStrategy()
train_compiled_program = static.CompiledProgram(
train_program).with_data_parallel(
loss_name=train_fetch_list[0].name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
valid_top1_list = []
for epoch_id in range(args.retain_epoch):
train_top1 = train(train_compiled_program, exe, epoch_id, train_loader,
train_fetch_list, args)
_logger.info("TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id,
train_top1))
_logger.info(
"TRAIN: Epoch {}, train_acc {:.6f}".format(epoch_id, train_top1))
valid_top1 = valid(test_program, exe, epoch_id, test_loader,
test_fetch_list, args)
_logger.info("TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id,
valid_top1))
_logger.info(
"TEST: Epoch {}, valid_acc {:.6f}".format(epoch_id, valid_top1))
valid_top1_list.append(valid_top1)
output_dir = os.path.join('darts_output', str(epoch_id))
......
......@@ -34,6 +34,7 @@ tar -xf MobileNetV1_pretrained.tar
Launch the pruning task with the following commands:
- Single-card launch:
```
export CUDA_VISIBLE_DEVICES=0
python train.py \
......@@ -43,9 +44,18 @@ python train.py \
--criterion "l1_norm"
```
Here, `model` specifies the model to prune, `pruned_ratio` sets the fraction of channels pruned from each convolution layer, and the `data` option selects the dataset.
The `criterion` option selects the pruning criterion; `l1_norm`, `bn_scale`, and `geometry_median` are currently supported, with `l1_norm` as the default. Set
this option to switch criteria. The four shell scripts in this directory run, on the three models ResNet34, MobileNetV1, and MobileNetV2, four sets of
- Multi-card launch:
```
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch train.py \
--model "MobileNet" \
--pruned_ratio 0.31 \
--data "mnist" \
--criterion "l1_norm" \
--fleet
```
Here, `model` specifies the model to prune, `pruned_ratio` sets the fraction of channels pruned from each convolution layer, and the `data` option selects the dataset. The `criterion` option selects the pruning criterion; `l1_norm`, `bn_scale`, and `geometry_median` are currently supported, defaulting to `l1_norm`. `fleet` turns on multi-card training and must be passed when launching on multiple cards. The four shell scripts in this directory run, on the three models ResNet34, MobileNetV1, and MobileNetV2, four sets of
experiments with `criterion` set to `geometry_median`; the scripts can be run directly to launch the pruning experiments.
Run `python train.py --help` for more options.
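For context, the `--fleet` flag added in this demo maps onto the collective-training hooks shown later in this diff (`fleet.init` plus `fleet.distributed_optimizer`); a condensed sketch of that wiring:

```python
import argparse

import paddle
from paddle.distributed import fleet

paddle.enable_static()

parser = argparse.ArgumentParser()
parser.add_argument('--fleet', action='store_true',
                    help="Whether to turn on distributed training.")
args = parser.parse_args()

if args.fleet:
    # Initialize collective (multi-card) training before building the program.
    fleet.init(is_collective=True)

opt = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
if args.fleet:
    # Wrap the optimizer so gradient synchronization is inserted for us.
    opt = fleet.distributed_optimizer(opt)
```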
......
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \
python -m paddle.distributed.launch train.py \
--model="MobileNet" \
--pretrained_model="/workspace/models/MobileNetV1_pretrained" \
--data="imagenet" \
......@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \
--criterion="geometry_median" \
--model_path="./fpgm_mobilenetv1_models" \
--fleet \
2>&1 | tee fpgm_mobilenetv1_train.log
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \
python -m paddle.distributed.launch train.py \
--model="MobileNetV2" \
--pretrained_model="/workspace/models/MobileNetV2_pretrained" \
--data="imagenet" \
......@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \
--criterion="geometry_median" \
--model_path="./fpgm_mobilenetv2_models" \
--fleet \
2>&1 | tee fpgm_mobilenetv2_train.log
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1,2,3
export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \
python -m paddle.distributed.launch train.py \
--model="ResNet34" \
--pretrained_model="/workspace/models/ResNet34_pretrained" \
--data="imagenet" \
......@@ -9,4 +9,5 @@ python train.py \
--lr_strategy="cosine_decay" \
--criterion="geometry_median" \
--model_path="./fpgm_resnet34_025_120_models" \
--fleet \
2>&1 | tee fpgm_resnet025_120_train.log
#!/bin/bash
export CUDA_VISIBLE_DEVICES=0,1
export FLAGS_fraction_of_gpu_memory_to_use=0.98
python train.py \
python -m paddle.distributed.launch train.py \
--model="ResNet34" \
--pretrained_model="/workspace/models/ResNet34_pretrained" \
--data="imagenet" \
......@@ -14,4 +14,5 @@ python train.py \
--lr_strategy="piecewise_decay" \
--criterion="geometry_median" \
--model_path="./fpgm_resnet34_models" \
--fleet \
2>&1 | tee fpgm_resnet03_train.log
......@@ -15,6 +15,7 @@ from paddleslim.analysis import flops
import models
from utility import add_arguments, print_arguments
import paddle.vision.transforms as T
from paddle.distributed import fleet
_logger = get_logger(__name__, level=logging.INFO)
......@@ -40,6 +41,7 @@ add_arg('pruned_ratio', float, None, "The ratios to be pruned.")
add_arg('criterion', str, "l1_norm", "The prune criterion to be used, support l1_norm and batch_norm_scale.")
add_arg('save_inference', bool, False, "Whether to save inference model.")
add_arg('ce_test', bool, False, "Whether to CE test.")
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable
model_list = models.__all__
......@@ -96,6 +98,8 @@ def create_optimizer(args, step_per_epoch):
def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4
shuffle = True
......@@ -130,8 +134,8 @@ def compress(args):
else:
raise ValueError("{} is not supported.".format(args.data))
image_shape = [int(m) for m in image_shape.split(",")]
assert args.model in model_list, "{} is not in lists: {}".format(args.model,
model_list)
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
places = paddle.static.cuda_places(
) if args.use_gpu else paddle.static.cpu_places()
place = places[0]
......@@ -140,13 +144,16 @@ def compress(args):
name='image', shape=[None] + image_shape, dtype='float32')
label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
batch_size_per_card = int(args.batch_size / len(places))
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=batch_size_per_card)
train_loader = paddle.io.DataLoader(
train_dataset,
places=places,
feed_list=[image, label],
drop_last=True,
batch_size=batch_size_per_card,
shuffle=shuffle,
batch_sampler=sampler,
return_list=False,
use_shared_memory=True,
num_workers=num_workers)
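The loader change above recurs throughout this commit: `batch_size`, `shuffle`, and `drop_last` move off `DataLoader` onto a `paddle.io.DistributedBatchSampler`, which also shards batches across ranks under `paddle.distributed.launch`. A self-contained sketch with a stand-in dataset:

```python
import numpy as np
import paddle
from paddle.io import DataLoader, Dataset, DistributedBatchSampler

class RandomDataset(Dataset):
    """Stand-in for the demo's ImageNet/MNIST datasets."""

    def __len__(self):
        return 64

    def __getitem__(self, idx):
        image = np.random.rand(3, 32, 32).astype('float32')
        label = np.array([0], dtype='int64')
        return image, label

dataset = RandomDataset()
# The sampler now owns batching/shuffling and splits batches per rank.
sampler = DistributedBatchSampler(
    dataset, batch_size=8, shuffle=True, drop_last=True)
# DataLoader no longer receives batch_size/shuffle/drop_last directly.
loader = DataLoader(dataset, batch_sampler=sampler, num_workers=0)

for image, label in loader:
    pass  # one training step per batch
```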
......@@ -171,6 +178,8 @@ def compress(args):
acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)
val_program = paddle.static.default_main_program().clone(for_test=True)
opt, learning_rate = create_optimizer(args, step_per_epoch)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost)
exe.run(paddle.static.default_startup_program())
......@@ -180,8 +189,8 @@ def compress(args):
def if_exist(var):
return os.path.exists(os.path.join(args.pretrained_model, var.name))
_logger.info("Load pretrained model from {}".format(
args.pretrained_model))
_logger.info(
"Load pretrained model from {}".format(args.pretrained_model))
paddle.static.load(paddle.static.default_main_program(),
args.pretrained_model, exe)
......@@ -247,13 +256,10 @@ def compress(args):
place=place)
_logger.info("FLOPs after pruning: {}".format(flops(pruned_program)))
build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
train_program = paddle.static.CompiledProgram(
pruned_program).with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
if args.fleet:
train_program = paddle.static.CompiledProgram(pruned_program)
else:
train_program = pruned_program
for i in range(args.num_epochs):
train(i, train_program)
......@@ -268,8 +274,8 @@ def compress(args):
infer_model_path, [image], [out],
exe,
program=pruned_val_program)
_logger.info("Saved inference model into [{}]".format(
infer_model_path))
_logger.info(
"Saved inference model into [{}]".format(infer_model_path))
def main():
......
......@@ -143,16 +143,7 @@ compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, a
### Disabling specific build strategies
```
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
```
### Training commands
......@@ -192,8 +183,10 @@ python train.py
Launch PACT quantization-aware training on multiple cards:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64 --fleet
```
> Multi-card training requires the `fleet` flag.
The output is:
```
......
......@@ -19,6 +19,7 @@ from paddleslim.quant import quant_aware, quant_post, convert
import models
from utility import add_arguments, print_arguments
from paddle.common_ops_import import LayerHelper
from paddle.distributed import fleet
quantization_model_save_dir = './quantization_models/'
_logger = get_logger(__name__, level=logging.INFO)
......@@ -68,6 +69,7 @@ add_arg('analysis', bool, False,
add_arg('onnx_format', bool, False,
"Whether use onnx format or not.")
add_arg('ce_test', bool, False, "Whether to CE test.")
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable
......@@ -120,6 +122,9 @@ def _prepare_envs():
def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4
shuffle = True
if args.ce_test:
......@@ -166,20 +171,24 @@ def compress(args):
if not args.analysis:
learning_rate, opt = create_optimizer(args)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost)
exe, places = _prepare_envs()
exe.run(paddle.static.default_startup_program())
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=args.batch_size)
train_loader = paddle.io.DataLoader(
train_dataset,
places=places,
feed_list=[image, label],
drop_last=True,
return_list=False,
batch_size=args.batch_size,
batch_sampler=sampler,
use_shared_memory=True,
shuffle=shuffle,
num_workers=num_workers)
valid_loader = paddle.io.DataLoader(
......@@ -379,22 +388,12 @@ def compress(args):
batch_id += 1
lr.step()
build_strategy = paddle.static.BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
# train loop
best_acc1 = 0.0
best_epoch = 0
start_epoch = 0
if args.checkpoint_dir is not None:
ckpt_path = args.checkpoint_dir
assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set"
start_epoch = args.checkpoint_epoch
paddle.static.load(
......
......@@ -52,18 +52,7 @@ val_program = quant_aware(val_program, place, quant_config, scope=None, for_test
compiled_train_prog = quant_aware(train_prog, place, quant_config, scope=None, for_test=False)
```
### Disabling specific build strategies
```
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
```
### Training commands
......@@ -81,7 +70,8 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log -
--model MobileNet \
--pretrained_model ./pretrain/MobileNetV1_pretrained \
--checkpoint_dir ./output/mobilenetv1 \
--num_epochs 30
--num_epochs 30 \
--fleet
```
After running, the final test results of ``best_model`` are very close to MobileNet's pre-quantization accuracy of top1=70.99%, top5=89.68%.
......@@ -15,6 +15,7 @@ from paddleslim.analysis import flops
from paddleslim.quant import quant_aware, convert
import paddle.vision.transforms as T
import models
from paddle.distributed import fleet
from utility import add_arguments, print_arguments
quantization_model_save_dir = './quantization_models/'
......@@ -41,6 +42,7 @@ add_arg('log_period', int, 10, "Log period in batches.")
add_arg('checkpoint_dir', str, "output", "checkpoint save dir")
add_arg('ce_test', bool, False, "Whether to CE test.")
add_arg('onnx_format', bool, False, "Whether to export the quantized model with format of ONNX.")
parser.add_argument('--fleet', action='store_true', help="Whether to turn on distributed training.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......@@ -92,6 +94,9 @@ def _prepare_envs():
def compress(args):
if args.fleet:
fleet.init(is_collective=True)
num_workers = 4
shuffle = True
if args.ce_test:
......@@ -154,8 +159,8 @@ def compress(args):
raise ValueError("{} is not supported.".format(args.data))
image_shape = [int(m) for m in image_shape.split(",")]
assert args.model in model_list, "{} is not in lists: {}".format(args.model,
model_list)
assert args.model in model_list, "{} is not in lists: {}".format(
args.model, model_list)
image = paddle.static.data(
name='image', shape=[None] + image_shape, dtype='float32')
label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
......@@ -182,6 +187,8 @@ def compress(args):
compiled_train_prog = quant_aware(
train_prog, places, quant_config, scope=None, for_test=False)
opt = create_optimizer(args)
if args.fleet:
opt = fleet.distributed_optimizer(opt)
opt.minimize(avg_cost)
exe.run(paddle.static.default_startup_program())
......@@ -192,16 +199,18 @@ def compress(args):
if args.pretrained_model:
paddle.static.load(train_prog, args.pretrained_model, exe)
sampler = paddle.io.DistributedBatchSampler(
train_dataset,
shuffle=shuffle,
drop_last=True,
batch_size=args.batch_size)
train_loader = paddle.io.DataLoader(
train_dataset,
places=places,
feed_list=[image, label],
drop_last=True,
batch_size=args.batch_size,
batch_sampler=sampler,
return_list=False,
use_shared_memory=True,
shuffle=shuffle,
num_workers=num_workers)
valid_loader = paddle.io.DataLoader(
val_dataset,
......@@ -257,17 +266,6 @@ def compress(args):
end_time - start_time))
batch_id += 1
build_strategy = paddle.static.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_train_prog = compiled_train_prog.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
############################################################################################################
# train loop
############################################################################################################
......
......@@ -133,8 +133,8 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id, weight,
if int(os.getenv("CPU_NUM")) > 1:
build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy.Reduce
program = paddle.static.CompiledProgram(train_program).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
program = paddle.static.CompiledProgram(
train_program, build_strategy=build_strategy)
for pass_id in range(args.num_passes):
py_reader.start()
......
......@@ -71,19 +71,23 @@ def test(test_exe, test_program, test_out, args):
for idx, data in enumerate(test_reader()):
res = []
res.append(
test_exe.run(test_program,
test_exe.run(
test_program,
feed={u'image_test': data[0][u'image_test1']},
fetch_list=out_feature))
res.append(
test_exe.run(test_program,
test_exe.run(
test_program,
feed={u'image_test': data[0][u'image_test2']},
fetch_list=out_feature))
res.append(
test_exe.run(test_program,
test_exe.run(
test_program,
feed={u'image_test': data[0][u'image_test3']},
fetch_list=out_feature))
res.append(
test_exe.run(test_program,
test_exe.run(
test_program,
feed={u'image_test': data[0][u'image_test4']},
fetch_list=out_feature))
featureL = np.concatenate((res[0][0], res[1][0]), 1)
......@@ -119,14 +123,12 @@ def train(exe, train_program, train_out, test_program, test_out, args):
build_strategy = paddle.static.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True
compiled_prog = paddle.static.CompiledProgram(
train_program, build_strategy=build_strategy).with_data_parallel(
loss_name=loss.name, build_strategy=build_strategy)
train_program, build_strategy=build_strategy)
best_ave = 0
for epoch_id in range(args.start_epoch, args.total_epoch):
for batch_id, data in enumerate(train_reader()):
loss, acc, global_lr = exe.run(compiled_prog,
feed=data,
fetch_list=fetch_list_train)
loss, acc, global_lr = exe.run(
compiled_prog, feed=data, fetch_list=fetch_list_train)
avg_loss = np.mean(np.array(loss))
avg_acc = np.mean(np.array(acc))
print(
......
......@@ -79,7 +79,7 @@ CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05
CUDA_VISIBLE_DEVICES=0 python train.py --batch_size 64 --data imagenet --lr 0.05 --pruning_mode ratio --ratio 0.55
```
GPU multi-card training: because the static-graph multi-card training path is incompatible with the mask logic used in unstructured sparsity, which can degrade training accuracy to some extent, we recommend launching sparse multi-card training via [Fleet](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/fleet_api_howto_cn.html); measured accuracy matches single-card training. To help developers convert distributed code configured with `with_data_parallel` to `Fleet`, the [example code](./train.py) also marks the required code changes clearly with `"Fleet step"` comments.
GPU multi-card training: because the static-graph multi-card training path is incompatible with the mask logic used in unstructured sparsity, which can degrade training accuracy to some extent, we recommend launching sparse multi-card training via [Fleet](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/fleet_api_howto_cn.html); the setup procedure is clearly marked with `"Fleet step"` in the [example code](./train.py).
```bash
python -m paddle.distributed.launch \
--selected_gpus="0,1,2,3" \
......
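The `"Fleet step"` markers mentioned above reduce to the sequence below; a condensed, single-process sketch (the real `train.py` also sets up the unstructured pruner and the data pipeline):

```python
import paddle
from paddle.distributed import fleet

paddle.enable_static()

# Fleet step: initialize collective training.
fleet.init(is_collective=True)

x = paddle.static.data(name='x', shape=[None, 8], dtype='float32')
loss = paddle.mean(paddle.static.nn.fc(x, size=1))

# Fleet step: wrap the optimizer before minimize().
opt = paddle.optimizer.Momentum(learning_rate=0.05, momentum=0.9)
opt = fleet.distributed_optimizer(opt)
opt.minimize(loss)

exe = paddle.static.Executor(paddle.CPUPlace())
exe.run(paddle.static.default_startup_program())
# Under `python -m paddle.distributed.launch`, each process then runs the
# plain (uncompiled) main program on its own data shard.
```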
......@@ -453,13 +453,6 @@ convert
# Call the API
quant_train_program = quant.quant_aware(train_program, place, config, for_test=False)
quant_eval_program = quant.quant_aware(eval_program, place, config, for_test=True)
# Disable strategies
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
quant_train_program = quant_train_program.with_data_parallel(
loss_name=avg_cost.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
inference_prog = quant.convert(quant_eval_program, place, config)
......
......@@ -23,7 +23,7 @@ from tqdm import tqdm
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import DataLoader
from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression
......@@ -101,7 +101,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1:
image = np.array(image)
label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program,
pred = exe.run(
compiled_test_program,
feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0])
......@@ -120,11 +121,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
# eval "eval model", which inputs are image and label, output is top1 and top5 accuracy
image = np.array(image)
label = np.array(label).astype('int64')
result = exe.run(compiled_test_program,
feed={
test_feed_names[0]: image,
test_feed_names[1]: label
},
result = exe.run(
compiled_test_program,
feed={test_feed_names[0]: image,
test_feed_names[1]: label},
fetch_list=test_fetch_list)
result = [np.mean(r) for r in result]
results.append(result)
......@@ -148,13 +148,13 @@ def main():
global_config = all_config["Global"]
gpu_num = paddle.distributed.get_world_size()
if isinstance(all_config['TrainConfig']['learning_rate'],
dict) and all_config['TrainConfig']['learning_rate'][
'type'] == 'CosineAnnealingDecay':
if isinstance(
all_config['TrainConfig']['learning_rate'], dict
) and all_config['TrainConfig']['learning_rate']['type'] == 'CosineAnnealingDecay':
step = int(
math.ceil(
float(args.total_images) / (global_config['batch_size'] *
gpu_num)))
float(args.total_images) / (
global_config['batch_size'] * gpu_num)))
all_config['TrainConfig']['learning_rate']['T_max'] = step
print('total training steps:', step)
......@@ -171,13 +171,15 @@ def main():
data_dir=data_dir,
crop_size=img_size,
resize_size=resize_size)
train_loader = DataLoader(
batch_sampler = DistributedBatchSampler(
train_dataset,
places=[place],
batch_size=global_config['batch_size'],
shuffle=True,
drop_last=True,
drop_last=True)
train_loader = DataLoader(
train_dataset,
places=[place],
batch_sampler=batch_sampler,
num_workers=0)
global_config['input_name'] = get_feed_vars(
global_config['model_dir'], global_config['model_filename'],
......@@ -198,8 +200,7 @@ def main():
global_config['batch_size'],
crop_size=img_size,
resize_size=resize_size,
place=place),
global_config['input_name']))
place=place), global_config['input_name']))
ac.compress()
......
......@@ -174,11 +174,11 @@ def reader():
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type
): fn(samples)
train_batch_sampler = paddle.io.BatchSampler(
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=global_config['batch_size'], shuffle=True)
[input_ids, token_type_ids, labels] = create_data_holder(global_config[
'task_name'])
[input_ids, token_type_ids,
labels] = create_data_holder(global_config['task_name'])
feed_list_name = []
train_data_loader = DataLoader(
dataset=train_ds,
......@@ -215,7 +215,8 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset()
for data in eval_dataloader():
logits = exe.run(compiled_test_program,
logits = exe.run(
compiled_test_program,
feed={
test_feed_names[0]: data[0]['input_ids'],
test_feed_names[1]: data[0]['token_type_ids']
......@@ -244,7 +245,8 @@ def eval():
metric.reset()
print('Evaluating...')
for data in eval_dataloader():
logits = exe.run(val_program,
logits = exe.run(
val_program,
feed={
feed_target_names[0]: data[0]['input_ids'],
feed_target_names[1]: data[0]['token_type_ids']
......
......@@ -181,8 +181,7 @@ def reader_proprecess(data_path, max_seq_len=512):
cur_result_list = []
for result in result_list:
if result['start'] + 1 <= max_content_len < result[
'end']:
if result['start'] + 1 <= max_content_len < result['end']:
max_content_len = result['start']
break
......@@ -276,7 +275,7 @@ def reader():
[input_ids, token_type_ids, start_ids, end_ids] = create_data_holder()
train_batch_sampler = paddle.io.BatchSampler(
train_batch_sampler = paddle.io.DistributedBatchSampler(
dataset=train_ds, batch_size=global_config['batch_size'], shuffle=True)
train_dataloader = paddle.io.DataLoader(
train_ds,
......@@ -299,7 +298,8 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset()
for data in eval_dataloader():
logits = exe.run(compiled_test_program,
logits = exe.run(
compiled_test_program,
feed={
'input_ids': data[0]['input_ids'],
'token_type_ids': data[0]['token_type_ids'],
......@@ -313,8 +313,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
start_prob = paddle.to_tensor(logits[0])
end_prob = paddle.to_tensor(logits[1])
num_correct, num_infer, num_label = metric.compute(start_prob, end_prob,
start_ids, end_ids)
num_correct, num_infer, num_label = metric.compute(
start_prob, end_prob, start_ids, end_ids)
metric.update(num_correct, num_infer, num_label)
paddle.enable_static()
precision, recall, f1 = metric.accumulate()
......
......@@ -120,17 +120,14 @@ def create_data_holder(task_name, input_names):
inputs = []
for name in input_names:
inputs.append(
paddle.static.data(
name=name, shape=[-1, -1], dtype="int64"))
paddle.static.data(name=name, shape=[-1, -1], dtype="int64"))
if task_name == "sts-b":
inputs.append(
paddle.static.data(
name="label", shape=[-1, 1], dtype="float32"))
paddle.static.data(name="label", shape=[-1, 1], dtype="float32"))
else:
inputs.append(
paddle.static.data(
name="label", shape=[-1, 1], dtype="int64"))
paddle.static.data(name="label", shape=[-1, 1], dtype="int64"))
return inputs
......@@ -164,7 +161,7 @@ def reader():
Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type
): fn(samples)
train_batch_sampler = paddle.io.BatchSampler(
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds,
batch_size=global_config['batch_size'],
shuffle=True,
......@@ -257,7 +254,8 @@ def reader():
def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
metric.reset()
for data in eval_dataloader():
logits = exe.run(compiled_test_program,
logits = exe.run(
compiled_test_program,
feed={
test_feed_names[0]: data[0]['x0'],
test_feed_names[1]: data[0]['x1'],
......@@ -293,7 +291,8 @@ def eval():
metric.reset()
print('Evaluating...')
for data in eval_dataloader():
logits = exe.run(val_program,
logits = exe.run(
val_program,
feed={
feed_target_names[0]: data[0]['x0'],
feed_target_names[1]: data[0]['x1'],
......
......@@ -67,7 +67,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
ncols=80) as t:
for data in val_loader:
data_all = {k: np.array(v) for k, v in data.items()}
outs = exe.run(compiled_test_program,
outs = exe.run(
compiled_test_program,
feed={test_feed_names[0]: data_all['image']},
fetch_list=test_fetch_list,
return_numpy=False)
......@@ -93,12 +94,10 @@ def main():
paddle.vision.image.set_image_backend('cv2')
train_dataset = paddle.vision.datasets.ImageFolder(
global_config['image_path'], transform=yolo_image_preprocess)
batch_sampler = paddle.io.DistributedBatchSampler(
train_dataset, batch_size=1, shuffle=True, drop_last=True)
train_loader = paddle.io.DataLoader(
train_dataset,
batch_size=1,
shuffle=True,
drop_last=True,
num_workers=0)
train_dataset, batch_sampler=batch_sampler, num_workers=0)
train_loader = reader_wrapper(train_loader, input_name=input_name)
eval_func = None
else:
......@@ -107,8 +106,10 @@ def main():
image_dir=global_config['coco_train_image_dir'],
anno_path=global_config['coco_train_anno_path'],
input_name=input_name)
batch_sampler = paddle.io.DistributedBatchSampler(
dataset, batch_size=1, shuffle=True, drop_last=True)
train_loader = paddle.io.DataLoader(
dataset, batch_size=1, shuffle=True, drop_last=True, num_workers=0)
dataset, batch_size=1, num_workers=0, batch_sampler=batch_sampler)
if paddle.distributed.get_rank() == 0:
eval_func = eval_function
global val_loader
......
......@@ -21,7 +21,7 @@ from functools import partial
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import DataLoader
from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression
......@@ -72,7 +72,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1:
image = np.array(image)
label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program,
pred = exe.run(
compiled_test_program,
feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0])
......@@ -114,13 +115,13 @@ def main():
data_dir = global_config['data_dir']
train_dataset = ImageNetDataset(mode='train', data_dir=data_dir)
train_loader = DataLoader(
batch_sampler = DistributedBatchSampler(
train_dataset,
batch_size=global_config['batch_size'],
shuffle=True,
drop_last=True,
num_workers=0)
drop_last=True)
train_loader = DataLoader(
train_dataset, batch_sampler=batch_sampler, num_workers=0)
train_dataloader = reader_wrapper(train_loader, global_config['input_name'])
ac = AutoCompression(
......
......@@ -23,7 +23,7 @@ from tqdm import tqdm
import numpy as np
import paddle
import paddle.nn as nn
from paddle.io import DataLoader
from paddle.io import DataLoader, DistributedBatchSampler
from imagenet_reader import ImageNetDataset
from paddleslim.common import load_config as load_slim_config
from paddleslim.auto_compression import AutoCompression
......@@ -97,7 +97,8 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
if len(test_feed_names) == 1:
image = np.array(image)
label = np.array(label).astype('int64')
pred = exe.run(compiled_test_program,
pred = exe.run(
compiled_test_program,
feed={test_feed_names[0]: image},
fetch_list=test_fetch_list)
pred = np.array(pred[0])
......@@ -116,11 +117,10 @@ def eval_function(exe, compiled_test_program, test_feed_names, test_fetch_list):
# eval "eval model", which inputs are image and label, output is top1 and top5 accuracy
image = np.array(image)
label = np.array(label).astype('int64')
result = exe.run(compiled_test_program,
feed={
test_feed_names[0]: image,
test_feed_names[1]: label
},
result = exe.run(
compiled_test_program,
feed={test_feed_names[0]: image,
test_feed_names[1]: label},
fetch_list=test_fetch_list)
result = [np.mean(r) for r in result]
results.append(result)
......@@ -144,13 +144,13 @@ def main():
global_config = all_config["Global"]
gpu_num = paddle.distributed.get_world_size()
if isinstance(all_config['TrainConfig']['learning_rate'],
dict) and all_config['TrainConfig']['learning_rate'][
'type'] == 'CosineAnnealingDecay':
if isinstance(
all_config['TrainConfig']['learning_rate'], dict
) and all_config['TrainConfig']['learning_rate']['type'] == 'CosineAnnealingDecay':
step = int(
math.ceil(
float(args.total_images) / (global_config['batch_size'] *
gpu_num)))
float(args.total_images) / (
global_config['batch_size'] * gpu_num)))
all_config['TrainConfig']['learning_rate']['T_max'] = step
print('total training steps:', step)
......@@ -167,13 +167,15 @@ def main():
data_dir=data_dir,
crop_size=img_size,
resize_size=resize_size)
train_loader = DataLoader(
batch_sampler = DistributedBatchSampler(
train_dataset,
places=[place],
batch_size=global_config['batch_size'],
shuffle=True,
drop_last=True,
drop_last=True)
train_loader = DataLoader(
train_dataset,
places=[place],
batch_sampler=batch_sampler,
num_workers=0)
train_dataloader = reader_wrapper(train_loader, global_config['input_name'])
......@@ -191,8 +193,7 @@ def main():
global_config['batch_size'],
crop_size=img_size,
resize_size=resize_size,
place=place),
global_config['input_name']))
place=place), global_config['input_name']))
ac.compress()
......
......@@ -157,10 +157,10 @@ class AutoCompression:
self.deploy_hardware = deploy_hardware
paddle.enable_static()
self._exe, self._places = self._prepare_envs()
self._exe, self._places, self.fleet = self._prepare_envs()
self.default_distill_node_pair, self.model_type = self._get_model_info()
if self.train_config is not None and self.train_config.use_fleet:
if self.fleet:
fleet.init(is_collective=True)
if with_variable_shape(
......@@ -306,7 +306,8 @@ class AutoCompression:
places = paddle.device._convert_to_place(devices)
_logger.info(f"devices: {devices}")
exe = paddle.static.Executor(places)
return exe, places
fleet = paddle.device.cuda.device_count() > 1
return exe, places, fleet
def _get_model_info(self):
[inference_program, _, _] = (load_inference_model(
......@@ -443,9 +444,8 @@ class AutoCompression:
return strategy, config
def _prepare_fleet_strategy(train_config):
def _prepare_fleet_strategy(self, train_config):
build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
strategy = fleet.DistributedStrategy()
strategy.build_strategy = build_strategy
......@@ -458,8 +458,6 @@ class AutoCompression:
if train_config.amp_config is not None:
strategy.amp = True
strategy.amp_configs = {**train_config.amp_config}
if train_config.asp_config is not None:
strategy.asp = True
return strategy
def _prepare_program(self, program, feed_target_names, fetch_targets,
......@@ -498,7 +496,7 @@ class AutoCompression:
self._exe, self._places, config_dict, train_program_info,
strategy, patterns, self.eval_dataloader)
if train_config.use_fleet:
if self.fleet:
dist_strategy = self._prepare_fleet_strategy(train_config)
else:
dist_strategy = None
......@@ -534,7 +532,7 @@ class AutoCompression:
self._exe.run(train_program_info.startup_program)
if (not train_config.use_fleet) and train_config.amp_config is not None:
if (not self.fleet) and train_config.amp_config is not None:
if hasattr(
train_config.amp_config,
'use_pure_fp16') and train_config.amp_config.use_pure_fp16:
......@@ -545,7 +543,7 @@ class AutoCompression:
### prune weight in scope
self._pruner.prune_model(train_program_info.program)
if not train_config.use_fleet:
if not self.fleet:
train_program_info = self._compiled_program(train_program_info,
strategy)
test_program_info = self._compiled_program(test_program_info,
......@@ -553,19 +551,16 @@ class AutoCompression:
return train_program_info, test_program_info
def _compiled_program(self, program_info, strategy):
compiled_prog = paddle.static.CompiledProgram(program_info.program)
build_strategy = paddle.static.BuildStrategy()
exec_strategy = paddle.static.ExecutionStrategy()
if 'qat' in strategy:
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
compiled_prog = compiled_prog.with_data_parallel(
loss_name=program_info.fetch_targets[0].name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
compiled_prog = paddle.static.CompiledProgram(
program_info.program, build_strategy=build_strategy)
program_info.program = compiled_prog
return program_info
......@@ -823,8 +818,12 @@ class AutoCompression:
test_program_info.fetch_targets)
if metric > best_metric:
tmp_program = test_program_info.program._program if isinstance(
test_program_info.program,
paddle.static.CompiledProgram
) else test_program_info.program
paddle.static.save(
program=test_program_info.program._program,
program=tmp_program,
model_path=os.path.join(self.tmp_dir,
'best_model'))
best_metric = metric
......
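The net effect of the `AutoCompression` changes above: the explicit `use_fleet` switch is gone, and distributed mode is now inferred inside `_prepare_envs` from the visible device count. A simplified sketch of that check:

```python
import paddle

def detect_fleet_mode():
    # Mirrors the new check in AutoCompression._prepare_envs: fleet
    # (collective) mode turns on whenever more than one CUDA device is
    # visible, e.g. under `python -m paddle.distributed.launch`.
    return paddle.device.cuda.device_count() > 1
```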
......@@ -346,7 +346,7 @@ def build_distill_program(executor,
with paddle.utils.unique_name.guard('merge'):
optimizer, learning_rate = _create_optimizer(train_config)
if train_config.get('use_fleet'):
if dist_strategy is not None:
optimizer = fleet.distributed_optimizer(optimizer,
dist_strategy)
else:
......@@ -385,8 +385,7 @@ def build_distill_program(executor,
loss.stop_gradient = False
if 'prune_params_name' in config: ### prune
if 'pruned_ratio' not in config and not train_config.get(
'use_fleet'): ### asp
if 'pruned_ratio' not in config and dist_strategy is None: ### asp
optimizer = pruner.decorate(optimizer)
optimizer.minimize(loss)
elif 'prune_strategy' in config: ###unstructure prune
......
......@@ -339,7 +339,6 @@ class TrainConfig:
logging_iter=10,
origin_metric=None,
target_metric=None,
use_fleet=False,
amp_config=None,
recompute_config=None,
sharding_config=None,
......@@ -372,8 +371,7 @@ class TrainConfig:
logging_iter(int): Log period in batches. Default: 10.
origin_metric(float, optional): The Metric of model before compress, used to check whether the dataloader is correct if is not None. Default: None.
target_metric(float, optional): The Metric of model after compress, if set target metric, the metric of compressed model satisfy the requirements, will be stop training. If not set, will train epochs as users set. Default: None.
use_fleet(bool): Whether to use fleet. Default: False.
amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description is as below if use_fleet=False:
amp_config(dict, optional): The dictionary contains all the configs of amp. Default: None. The detailed description below applies when distributed training is not enabled:
.. code-block:: python
AMP-O1 `<https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/amp_cn.html#id2>`_ :
{'custom_white_list', set} # The custom white_list. It's the set of ops that support
......@@ -389,10 +387,10 @@ class TrainConfig:
{'use_fp16_guard': bool} # Whether to use `fp16_guard` when constructing the program.
..
If you want to use AMP-O2, you need to set use_pure_fp16 is True and use_fp16_guard is False.
If use_fleet=True, the key of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_.
When distributed training is enabled, the keys of amp_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#amp-configs>`_.
recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config only can be set when use_fleet=True, the key of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_.
sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config only can be set when use_fleet=True, the key of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_.
recompute_config(dict, optional): The dictionary contains all the configs of recompute. Default: None. The recompute config can only be set when distributed training is enabled; the keys of recompute_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#recompute-configs>`_.
sharding_config(dict, optional): The dictionary contains all the configs of sharding. Default: None. The sharding config can only be set when distributed training is enabled; the keys of sharding_config can reference `<https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/fleet/DistributedStrategy_cn.html#sharding-configs>`_.
sparse_model(bool, optional): Set sparse_model to ``True`` to remove mask tensor when the compress strategy is unstructure prune. Default: False.
"""
self.epochs = epochs
......@@ -403,7 +401,6 @@ class TrainConfig:
self.logging_iter = logging_iter
self.origin_metric = origin_metric
self.target_metric = target_metric
self.use_fleet = use_fleet
self.amp_config = amp_config
self.recompute_config = recompute_config
self.sharding_config = sharding_config
......
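Correspondingly, configs that used to pass `use_fleet=True` now simply omit the key. A hypothetical minimal `TrainConfig` after this change (parameter names are taken from this diff; the import path and all values are assumptions for illustration):

```python
from paddleslim.auto_compression import TrainConfig

# Hypothetical minimal config; use_fleet is no longer a parameter.
train_config = TrainConfig(
    epochs=1,               # illustrative value
    logging_iter=10,        # log period in batches
    origin_metric=None,     # pre-compression metric, used as a sanity check
    target_metric=None,     # stop training early once reached, if set
    amp_config=None,        # distributed keys follow fleet's DistributedStrategy
    recompute_config=None,  # only honored when distributed training is on
    sharding_config=None,   # likewise distributed-only
)
```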
......@@ -103,13 +103,11 @@ class VarCollector(object):
def run(self, reader, exe, step=None, loss_name=None):
if not hasattr(self.program, '_program'):
# Compile the native program to speed up
program = paddle.static.CompiledProgram(
self.program).with_data_parallel(loss_name=loss_name)
program = paddle.static.CompiledProgram(self.program)
for idx, data in enumerate(reader):
vars_np = exe.run(program=program,
feed=data,
fetch_list=self.real_names)
vars_np = exe.run(
program=program, feed=data, fetch_list=self.real_names)
mapped_vars_np = dict(zip(self.real_names, vars_np))
values = self.update(mapped_vars_np)
......@@ -129,8 +127,7 @@ class VarCollector(object):
if not hasattr(self.program, '_program'):
# Compile the native program to speed up
program = paddle.static.CompiledProgram(
self.program).with_data_parallel(loss_name=loss_name)
program = paddle.static.CompiledProgram(self.program)
for idx, data in enumerate(reader):
vars_np = exe.run(program=program, feed=data, fetch_list=fetch_list)
vars_np = [np.max(var) for var in vars_np]
......@@ -172,5 +169,5 @@ class VarCollector(object):
plt.show()
pdf.savefig()
plt.close()
_logger.info("variables histogram have been saved as {}".format(
pdf_path))
_logger.info(
"variables histogram have been saved as {}".format(pdf_path))
......@@ -107,17 +107,14 @@ def _parse_train_configs(train_config):
def _compile_program(program, fetch_var_name):
"""compiling program"""
compiled_prog = paddle.static.CompiledProgram(program)
build_strategy = paddle.static.BuildStrategy()
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.sync_batch_norm = False
exec_strategy = paddle.static.ExecutionStrategy()
compiled_prog = compiled_prog.with_data_parallel(
loss_name=fetch_var_name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
compiled_prog = paddle.static.CompiledProgram(
program, build_strategy=build_strategy)
return compiled_prog
......