未验证 提交 c9c0e83f 编写于 作者: C cc 提交者: GitHub

Add int16 quantization for embedding (#857)

上级 4d75cb9c
...@@ -461,12 +461,27 @@ fluid.Program ...@@ -461,12 +461,27 @@ fluid.Program
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
# 量化为8比特,Embedding参数的体积减小4倍,精度有轻微损失
config = { config = {
'quantize_op_types': ['lookup_table'], 'quantize_op_types': ['lookup_table'],
'lookup_table': { 'lookup_table': {
'quantize_type': 'abs_max' 'quantize_type': 'abs_max',
'quantize_bits': 8,
'dtype': 'int8'
} }
} }
'''
# 量化为16比特,Embedding参数的体积减小2倍,精度损失很小
config = {
'quantize_op_types': ['lookup_table'],
'lookup_table': {
'quantize_type': 'abs_max',
'quantize_bits': 16,
'dtype': 'int16'
}
}
'''
quant_program = quant.quant_embedding(infer_program, place, config) quant_program = quant.quant_embedding(infer_program, place, config)
更详细的用法请参考 `Embedding量化demo <https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/quant/quant_embedding>`_ 更详细的用法请参考 `Embedding量化demo <https://github.com/PaddlePaddle/PaddleSlim/tree/develop/demo/quant/quant_embedding>`_
......
# Embedding量化 # Embedding量化
Embedding量化将网络中的Embedding参数从`float32`类型量化到 `8-bit`或者 `16-bit` 整数类型,在几乎不损失模型精度的情况下减少模型的存储空间和显存占用。
Embedding量化仅能减少模型参数的体积,加快加载Embedding参数的速度,并不能显著提升模型预测速度。
## 使用方法 ## 使用方法
在预测时调用paddleslim `quant_embedding`接口,主要实现代码如下: 在预测时调用paddleslim `quant_embedding`接口,主要实现代码如下:
...@@ -29,12 +30,28 @@ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() ...@@ -29,12 +30,28 @@ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
# 量化为8比特,Embedding参数的体积减小4倍,精度有轻微损失
config = {
'quantize_op_types': ['lookup_table'],
'lookup_table': {
'quantize_type': 'abs_max',
'quantize_bits': 8,
'dtype': 'int8'
}
}
'''
# 量化为16比特,Embedding参数的体积减小2倍,精度损失很小
config = { config = {
'quantize_op_types': ['lookup_table'], 'quantize_op_types': ['lookup_table'],
'lookup_table': { 'lookup_table': {
'quantize_type': 'abs_max' 'quantize_type': 'abs_max',
'quantize_bits': 16,
'dtype': 'int16'
} }
} }
'''
quant_program = quant.quant_embedding(infer_program, place, config) quant_program = quant.quant_embedding(infer_program, place, config)
``` ```
......
...@@ -35,10 +35,13 @@ _default_single_config = { ...@@ -35,10 +35,13 @@ _default_single_config = {
"quantize_bits": 8, "quantize_bits": 8,
"dtype": "int8" "dtype": "int8"
} }
SUPPORT_OP_TYPES = ['lookup_table', 'fused_embedding_seq_pool', 'pyramid_hash'] SUPPORT_OP_TYPES = [
'lookup_table', 'lookup_table_v2', 'fused_embedding_seq_pool',
'pyramid_hash'
]
SUPPORT_QUANTIZE_TYPES = ['abs_max', 'log'] SUPPORT_QUANTIZE_TYPES = ['abs_max', 'log']
SUPPORT_QUANTIZE_BITS = [8] SUPPORT_QUANTIZE_BITS = [8, 16]
SUPPORT_DTYPE = ['int8'] SUPPORT_DTYPE = ['int8', 'int16']
_default_config = {"quantize_op_types": SUPPORT_OP_TYPES, } _default_config = {"quantize_op_types": SUPPORT_OP_TYPES, }
...@@ -125,7 +128,7 @@ def _get_quant_var_name(var_name): ...@@ -125,7 +128,7 @@ def _get_quant_var_name(var_name):
""" """
get quantized var name get quantized var name
""" """
return var_name + '.int8' return var_name + '.int'
def _get_dequant_var_name(var_name): def _get_dequant_var_name(var_name):
...@@ -151,6 +154,11 @@ def _clear_var(var_name, scope): ...@@ -151,6 +154,11 @@ def _clear_var(var_name, scope):
tensor._clear() tensor._clear()
def _get_var_dtype(config):
    """Map the configured dtype string to the framework VarDesc var type.

    The config is validated upstream against SUPPORT_DTYPE, so ``dtype``
    here is either ``'int8'`` or ``'int16'``.
    """
    if config['dtype'] == 'int8':
        return core.VarDesc.VarType.INT8
    return core.VarDesc.VarType.INT16
def _quant_embedding_abs_max(graph, scope, place, config, var_name, def _quant_embedding_abs_max(graph, scope, place, config, var_name,
embedding_node): embedding_node):
""" """
...@@ -230,7 +238,7 @@ def _quant_embedding_abs_max(graph, scope, place, config, var_name, ...@@ -230,7 +238,7 @@ def _quant_embedding_abs_max(graph, scope, place, config, var_name,
_get_quant_var_name(var_name), _get_quant_var_name(var_name),
var_type=embedding_node.type(), var_type=embedding_node.type(),
shape=embedding_node.shape(), shape=embedding_node.shape(),
var_dtype=core.VarDesc.VarType.INT8) var_dtype=_get_var_dtype(config))
# create var in scope # create var in scope
scope.var(_get_quant_var_name(var_name)) scope.var(_get_quant_var_name(var_name))
scope.var(_get_scale_var_name(var_name)) scope.var(_get_scale_var_name(var_name))
......
...@@ -21,15 +21,28 @@ from static_case import StaticCase ...@@ -21,15 +21,28 @@ from static_case import StaticCase
class TestQuantEmbedding(StaticCase): class TestQuantEmbedding(StaticCase):
def set_config(self):
    """Configure 8-bit (int8) abs_max quantization for lookup_table_v2 ops."""
    # NOTE(review): the per-op sub-config must be keyed by the op type that
    # appears in 'quantize_op_types'. The original keyed it 'lookup_table'
    # while quantizing 'lookup_table_v2' ops, so the explicit settings were
    # silently ignored and the quantizer fell back to its defaults. Keyed by
    # the actual op type so these settings take effect.
    self.config = {
        'quantize_op_types': ['lookup_table_v2'],
        'lookup_table_v2': {
            'quantize_type': 'abs_max',
            'quantize_bits': 8,
            'dtype': 'int8'
        }
    }
def test_quant_embedding(self): def test_quant_embedding(self):
self.set_config()
train_program = paddle.static.Program() train_program = paddle.static.Program()
with paddle.static.program_guard(train_program): startup_program = paddle.static.Program()
with paddle.static.program_guard(train_program, startup_program):
input_word = paddle.static.data( input_word = paddle.static.data(
name="input_word", shape=[None, 1], dtype='int64') name="input_word", shape=[None, 1], dtype='int64')
param_attr = paddle.ParamAttr( param_attr = paddle.ParamAttr(
name='emb', name='emb',
initializer=paddle.nn.initializer.Uniform(-0.005, 0.005)) initializer=paddle.nn.initializer.Uniform(-0.005, 0.005))
weight = train_program.global_block().create_parameter( weight = paddle.static.create_parameter(
(100, 128), attr=param_attr, dtype="float32") (100, 128), attr=param_attr, dtype="float32")
input_emb = paddle.nn.functional.embedding( input_emb = paddle.nn.functional.embedding(
...@@ -37,13 +50,24 @@ class TestQuantEmbedding(StaticCase): ...@@ -37,13 +50,24 @@ class TestQuantEmbedding(StaticCase):
infer_program = train_program.clone(for_test=True) infer_program = train_program.clone(for_test=True)
use_gpu = True place = paddle.CPUPlace()
place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
exe = paddle.static.Executor(place) exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program()) exe.run(startup_program)
quant_program = quant.quant_embedding(infer_program, place) quant_program = quant.quant_embedding(infer_program, place)
class TestQuantEmbeddingInt16(TestQuantEmbedding):
    """Re-run the embedding-quantization test with a 16-bit (int16) config.

    Inherits the test body from TestQuantEmbedding and only overrides the
    quantization config used by it.
    """

    def set_config(self):
        # 16-bit halves the embedding storage (vs. 4x for 8-bit) with very
        # little accuracy loss, per the doc change in this same commit.
        self.config = {
            'quantize_op_types': ['lookup_table'],
            'lookup_table': {
                'quantize_type': 'abs_max',
                'quantize_bits': 16,
                'dtype': 'int16'
            }
        }
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册