Unverified · Commit 64f9442b · authored by Bai Yifan · committed by GitHub

Merge branch 'develop' into pact_clip

@@ -159,7 +159,7 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
 Regular quantization:
 ```
-python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --checkpoint_dir ./output/MobileNetV3_large_x1_0 --num_epochs 30 --lr 0.0001 --use_pact False
+python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact False
 ```
@@ -179,7 +179,7 @@ python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/Mob
 Quantization-aware training with PACT:
 ```
-python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --checkpoint_dir ./output/MobileNetV3_large_x1_0 --num_epochs 30 --lr 0.0001 --use_pact True --batch_size 128 --lr_strategy=piecewise_decay --step_epochs 20 --l2_decay 1e-5
+python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact True --batch_size 128 --lr_strategy=piecewise_decay --step_epochs 20 --l2_decay 1e-5
 ```
 The output is:
......
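For context on the `--use_pact` flag used above: PACT replaces the ReLU activation with a clipped version whose upper bound alpha is learned during training, which narrows the activation range that quantization must cover. Below is a minimal NumPy sketch of the clipping function from the PACT paper (illustration only, not how the demo wires PACT into PaddleSlim):

```python
# Minimal sketch of the PACT activation (Choi et al., 2018):
# y = 0.5 * (|x| - |x - alpha| + alpha), which equals clip(x, 0, alpha).
# alpha is a learnable clipping threshold in the real method.
import numpy as np

def pact(x, alpha=6.0):
    return 0.5 * (np.abs(x) - np.abs(x - alpha) + alpha)

x = np.array([-2.0, 0.5, 3.0, 9.0])
print(pact(x))  # [0.  0.5 3.  6. ] -- negatives zeroed, values above alpha clipped
```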
@@ -55,8 +55,12 @@ add_arg('data', str, "imagenet",
         "Which data to use. 'mnist' or 'imagenet'")
 add_arg('log_period', int, 10,
         "Log period in batches.")
-add_arg('checkpoint_dir', str, "output",
-        "checkpoint save dir")
+add_arg('checkpoint_dir', str, None,
+        "Checkpoint directory to resume training from.")
+add_arg('checkpoint_epoch', int, None,
+        "Epoch of the checkpoint to resume from.")
+add_arg('output_dir', str, "output/MobileNetV3_large_x1_0",
+        "Directory for saving model checkpoints.")
 add_arg('use_pact', bool, True,
         "Whether to use PACT or not.")
@@ -288,6 +292,7 @@ def compress(args):
                 compiled_train_prog,
                 feed=data,
                 fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
             end_time = time.time()
             loss_n = np.mean(loss_n)
             acc_top1_n = np.mean(acc_top1_n)
@@ -322,24 +327,37 @@ def compress(args):
     # train loop
     best_acc1 = 0.0
     best_epoch = 0
-    for i in range(args.num_epochs):
+    start_epoch = 0
+    if args.checkpoint_dir is not None:
+        ckpt_path = args.checkpoint_dir
+        assert args.checkpoint_epoch is not None, "checkpoint_epoch must be set"
+        start_epoch = args.checkpoint_epoch
+        fluid.io.load_persistables(
+            exe, dirname=args.checkpoint_dir, main_program=val_program)
+        start_step = start_epoch * int(
+            math.ceil(float(args.total_images) / args.batch_size))
+        v = fluid.global_scope().find_var('@LR_DECAY_COUNTER@').get_tensor()
+        v.set(np.array([start_step]).astype(np.float32), place)
+
+    for i in range(start_epoch, args.num_epochs):
         train(i, compiled_train_prog)
         acc1 = test(i, val_program)
         fluid.io.save_persistables(
             exe,
-            dirname=os.path.join(args.checkpoint_dir, str(i)),
+            dirname=os.path.join(args.output_dir, str(i)),
             main_program=val_program)
         if acc1 > best_acc1:
             best_acc1 = acc1
             best_epoch = i
             fluid.io.save_persistables(
                 exe,
-                dirname=os.path.join(args.checkpoint_dir, 'best_model'),
+                dirname=os.path.join(args.output_dir, 'best_model'),
                 main_program=val_program)
-    if os.path.exists(os.path.join(args.checkpoint_dir, 'best_model')):
+    if os.path.exists(os.path.join(args.output_dir, 'best_model')):
         fluid.io.load_persistables(
             exe,
-            dirname=os.path.join(args.checkpoint_dir, 'best_model'),
+            dirname=os.path.join(args.output_dir, 'best_model'),
             main_program=val_program)
     # 3. Freeze the graph after training by adjusting the quantize
......
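The resume branch added above restores not only the weights but also the learning-rate schedule: `@LR_DECAY_COUNTER@` is the global-step variable that `piecewise_decay` reads, so it must be fast-forwarded to the step count of the checkpointed epoch. A small sketch of that arithmetic, with hypothetical numbers (an ImageNet-sized dataset is assumed for illustration):

```python
# Sketch of the resume arithmetic: global step = completed epochs * batches
# per epoch, so the piecewise-decay schedule resumes at the right boundary.
import math

total_images = 1281167   # hypothetical --total_images value
batch_size = 128
checkpoint_epoch = 20    # hypothetical --checkpoint_epoch value

steps_per_epoch = int(math.ceil(float(total_images) / batch_size))
start_step = checkpoint_epoch * steps_per_epoch
print(start_step)  # 200200 -- the value written into '@LR_DECAY_COUNTER@'
```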
@@ -126,14 +126,14 @@ test(val_quant_program)
 ```python
 float_prog, int8_prog = slim.quant.convert(val_quant_program, exe.place, save_int8=True)
-target_vars = [float_prog.global_block().var(name) for name in outputs]
+target_vars = [float_prog.global_block().var(outputs[-1])]
 fluid.io.save_inference_model(dirname='./inference_model/float',
-        feeded_var_names=[var.name for var in inputs],
+        feeded_var_names=[inputs[0].name],
         target_vars=target_vars,
         executor=exe,
         main_program=float_prog)
 fluid.io.save_inference_model(dirname='./inference_model/int8',
-        feeded_var_names=[var.name for var in inputs],
+        feeded_var_names=[inputs[0].name],
         target_vars=target_vars,
         executor=exe,
         main_program=int8_prog)
......
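The change above narrows the saved interface to a single input (the image) and the final output. A hedged sketch of loading one of the resulting directories back for inference with the Paddle 1.x fluid API (the 1x3x224x224 input shape is an assumption matching the image classification demo):

```python
# Sketch: consume the inference model saved above. Directory name matches
# the save_inference_model call in the diff; input shape is assumed (NCHW).
import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()
exe = fluid.Executor(place)

infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
    dirname='./inference_model/float', executor=exe)

image = np.random.random((1, 3, 224, 224)).astype('float32')
results = exe.run(infer_prog,
                  feed={feed_names[0]: image},
                  fetch_list=fetch_targets)
print(results[0].shape)  # scores for the single fetched output
```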
 # Tutorial: deploying quantized models on Intel CPUs
-On Intel Cascade Lake machines (e.g. Intel(R) Xeon(R) Gold 6271), with quantization and DNNL acceleration, INT8 models reach 3x to 3.7x the single-thread performance of FP32 models; on Intel Skylake machines (e.g. Intel(R) Xeon(R) Gold 6148), single-thread performance is 1.5x that of FP32 models, with only a negligible drop in accuracy. For a sample tutorial on image classification quantization, see [Optimized deployment and inference of image classification INT8 models on CPU](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/mkldnn_quant/README.md). For quantizing natural language processing models, see [Reproducing ERNIE INT8 model accuracy and performance](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn).
+On Intel Cascade Lake machines (e.g. Intel(R) Xeon(R) Gold 6271), with quantization and DNNL acceleration, INT8 models reach 3x to 3.7x the single-thread performance of FP32 models; on Intel Skylake machines (e.g. Intel(R) Xeon(R) Gold 6148), single-thread performance is 1.5x that of FP32 models, with only a negligible drop in accuracy. For a sample tutorial on image classification quantization, see [Optimized deployment and inference of image classification INT8 models on CPU](https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/mkldnn_quant/). For quantizing natural language processing models, see [Reproducing ERNIE INT8 model accuracy and performance](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn).
 ## Accuracy and performance of image classification INT8 models on Xeon(R) 6271
......
@@ -29,4 +29,4 @@ def image_classification(model, image_shape, class_num, use_gpu=False):
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     return exe, train_program, val_program, (image, label), (
-        acc_top1.name, acc_top5.name, avg_cost.name)
+        acc_top1.name, acc_top5.name, avg_cost.name, out.name)
@@ -14,7 +14,7 @@
 # limitations under the License.
 from ..core import GraphWrapper
-from .prune_walker import conv2d as conv2d_walker
+from .prune_walker import PRUNE_WORKER
 __all__ = ["collect_convs"]
@@ -55,8 +55,9 @@ def collect_convs(params, graph, visited={}):
         pruned_params = []
         param = graph.var(param)
         conv_op = param.outputs()[0]
-        walker = conv2d_walker(
-            conv_op, pruned_params=pruned_params, visited=visited)
+
+        cls = PRUNE_WORKER.get(conv_op.type())
+        walker = cls(conv_op, pruned_params=pruned_params, visited=visited)
         walker.prune(param, pruned_axis=0, pruned_idx=[0])
         groups.append(pruned_params)
     visited = set()
......
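The import change above swaps a hard-coded `conv2d` walker for a lookup in the `PRUNE_WORKER` registry keyed by op type, so ops such as `conv2d_transpose` can supply their own pruning logic. A minimal, self-contained sketch of that registry-dispatch pattern (class names are illustrative, not PaddleSlim's internals):

```python
# Toy registry dispatch: map an op-type string to the worker class that
# knows how to prune it, then look it up the same way the diff does.
PRUNE_WORKER = {}

def register(op_type):
    def wrapper(cls):
        PRUNE_WORKER[op_type] = cls
        return cls
    return wrapper

@register('conv2d')
class Conv2dWorker:
    def prune(self):
        return 'pruning conv2d filters'

@register('conv2d_transpose')
class Conv2dTransposeWorker:
    def prune(self):
        return 'pruning conv2d_transpose input channels'

worker = PRUNE_WORKER.get('conv2d_transpose')()  # same lookup shape as above
print(worker.prune())
```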
@@ -84,9 +84,7 @@ class PruneWorker(object):
             cls = PRUNE_WORKER.get("default_walker")
         _logger.debug("\nfrom: {}\nto: {}\npruned_axis: {}; var: {}".format(
             self.op, op, pruned_axis, var.name()))
-        walker = cls(op,
-                     pruned_params=self.pruned_params,
-                     visited=self.visited)
+        walker = cls(op, pruned_params=self.pruned_params, visited=self.visited)
         walker.prune(var, pruned_axis, pruned_idx)
@@ -175,29 +173,8 @@ class conv2d_transpose(PruneWorker):
                 self._prune_op(op, filter_var, 0, pruned_idx)
         elif var in self.op.inputs("Filter"):
-            assert pruned_axis in [0, 1]
-            self.pruned_params.append((var, pruned_axis, pruned_idx))
-            for op in var.outputs():
-                self._prune_op(op, var, pruned_axis, pruned_idx)
-            if pruned_axis == 1:
-                if len(self.op.inputs("Bias")) > 0:
-                    self.pruned_params.append(
-                        (self.op.inputs("Bias"), channel_axis, pruned_idx))
-                output_var = self.op.outputs("Output")[0]
-                self._visit(output_var, channel_axis)
-                next_ops = output_var.outputs()
-                for op in next_ops:
-                    self._prune_op(op, output_var, channel_axis, pruned_idx)
-            elif pruned_axis == 0:
-                input_var = self.op.inputs("Input")[0]
-                self._visit(input_var, channel_axis)
-                pre_ops = input_var.inputs()
-                for op in pre_ops:
-                    self._prune_op(op, input_var, channel_axis, pruned_idx)
+            _logger.warn("Skip pruning output channels of conv2d_transpose!")
+            return
         elif var in self.op.outputs("Output"):
             assert pruned_axis == channel_axis, "pruned_axis: {}; var: {}".format(
                 pruned_axis, var.name())
......
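The rewritten branch above bails out instead of pruning the Filter of a `conv2d_transpose` directly. The likely rationale (an inference from the diff, not stated in it): a transposed-convolution filter is laid out as (in_channels, out_channels, kH, kW), so axis 0 is determined by the preceding layer, while axis 1 would change the op's own output width and everything downstream. A small NumPy sketch of that layout, with shapes mirroring the test case later in the diff:

```python
# conv2d_transpose filter layout in fluid is (in, out, kH, kW). Deleting
# slices on axis 0 is what pruning the *input* channels would look like.
import numpy as np

filter_w = np.zeros((8, 16, 2, 2))          # conv6 outputs 8 channels; num_filters=16
pruned_in = np.delete(filter_w, [0, 1], 0)  # axis 0 follows the input's pruning
print(pruned_in.shape)                      # (6, 16, 2, 2)
# Pruning axis 1 would shrink the op's own output channels -- the case the
# walker now skips with a warning instead of handling.
```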
@@ -41,6 +41,9 @@ class TestPrune(unittest.TestCase):
             conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
             conv6 = conv_bn_layer(conv5, 8, 3, "conv6")
+            conv7 = fluid.layers.conv2d_transpose(
+                input=conv6, num_filters=16, filter_size=2, stride=2)
         shapes = {}
         for param in main_program.global_block().all_parameters():
             shapes[param.name] = param.shape
@@ -53,8 +56,8 @@ class TestPrune(unittest.TestCase):
         main_program, _, _ = pruner.prune(
             main_program,
             scope,
-            params=["conv4_weights"],
-            ratios=[0.5],
+            params=["conv4_weights", "conv2d_transpose_0.w_0"],
+            ratios=[0.5, 0.6],
             place=place,
             lazy=False,
             only_graph=False,
@@ -67,11 +70,12 @@ class TestPrune(unittest.TestCase):
             "conv3_weights": (8, 4, 3, 3),
             "conv4_weights": (4, 8, 3, 3),
             "conv5_weights": (8, 4, 3, 3),
-            "conv6_weights": (8, 8, 3, 3)
+            "conv6_weights": (8, 8, 3, 3),
+            "conv2d_transpose_0.w_0": (8, 16, 2, 2),
         }
         for param in main_program.global_block().all_parameters():
-            if "weights" in param.name:
+            if param.name in shapes:
                 print("param: {}; param shape: {}".format(param.name,
                                                           param.shape))
                 self.assertTrue(param.shape == shapes[param.name])
......
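A note on the expected shape added to the test: `conv2d_transpose_0.w_0` is asserted at its construction-time shape, because the walker's early return means the requested 0.6 prune ratio has no effect on it. A quick check of that expectation:

```python
# conv6 provides 8 input channels, num_filters=16, filter_size=2, so the
# unpruned weight is (8, 16, 2, 2); the skipped prune leaves it unchanged.
in_channels, num_filters, k = 8, 16, 2
unpruned = (in_channels, num_filters, k, k)
assert unpruned == (8, 16, 2, 2)  # value encoded in the test's shapes dict
print("conv2d_transpose_0.w_0 left untouched by the skipped prune")
```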