diff --git a/docs/benchmark/benchmark.md b/docs/benchmark/benchmark.md
index 7ed1ae1f637d472a6d19ae4c44314268241810a3..fab2689f87482419e986526d54b6fbc7a17806b9 100644
--- a/docs/benchmark/benchmark.md
+++ b/docs/benchmark/benchmark.md
@@ -144,7 +144,7 @@ mobilenet_v2 |48.60 |25.43 |13.76 |43.06 |22.10 |12.09 |
 * HUAWEI P40, Kirin 990 5G
 * 2 x Cortex-A76 Based 2.86GHz + 2 x Cortex-A76 Based 2.36GHz + 4 x Cortex-A55 1.95GHz
-* HIAI ddk 版本: 310
+* HIAI ddk 版本: 310 or 320
 * 测试说明
   * branch: release/v2.6.1
@@ -156,10 +156,23 @@ mobilenet_v2 |48.60 |25.43 |13.76 |43.06 |22.10 |12.09 |
 
 #### paddlepaddle model
 
+- ddk 310
+
 |Kirin |810||990||990 5G||
 |---|---|---|---|---|---|---|
-||cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |
-|mobilenet_v1| 33.84| 3.10| 31.91| 4.07| 33.97| 3.20|
-|mobilenet_v2| 23.32| 3.51| 22.47| 5.61| 23.17| 3.51|
-|squeezenet| 18.47| 4.35| 17.79| 5.05| 18.65| 3.47|
-|mnasnet| 20.24| 3.28| 19.54| 5.17| 20.34| 3.32|
+| |cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |
+|mobilenet_v1| 41.20| 12.76| 31.91| 4.07| 33.97| 3.20|
+|mobilenet_v2| 29.57| 12.12| 22.47| 5.61| 23.17| 3.51|
+|squeezenet| 23.96| 9.04| 17.79| 3.82| 18.65| 3.01|
+|mnasnet| 26.47| 13.62| 19.54| 5.17| 20.34| 3.32|
+
+
+- ddk 320
+
+|模型 |990||990-5G||
+|---|---|---|---|
+||cpu(ms) | npu(ms) |cpu(ms) | npu(ms) |
+|ssd_mobilenetv1| 65.67| 18.21| 71.8| 16.6|
+
+
+*说明:ssd_mobilenetv1的npu性能为npu、cpu混合调度运行的总时间*
diff --git a/docs/user_guides/post_quant_no_data.md b/docs/user_guides/post_quant_no_data.md
index 7443b4cac9d8de977ce6d52e6a61b8f78b7aaef4..fa9b42ad9bb5110609c9ffaec179286352c3a4f0 100644
--- a/docs/user_guides/post_quant_no_data.md
+++ b/docs/user_guides/post_quant_no_data.md
@@ -1,16 +1,20 @@
-# 模型量化-无校准数据训练后量化
+# 模型量化-动态离线量化
 
-本文首先简单介绍无校准数据训练后量化,然后说明产出量化模型,最后阐述量化模型预测。
+本文首先简单介绍动态离线量化,然后说明产出量化模型,最后阐述量化模型预测。
 
 ## 1 简介
 
-无校准数据训练后量化,将模型中特定OP的权重从FP32类型量化成INT8/16类型,可以减小预测模型的大小。使用该量化模型预测,首先将INT8/16类型的权重反量化成FP32类型,然后再进行预测。
+动态离线量化,将模型中特定OP的权重从FP32类型量化成INT8/16类型。
+
+该量化模型有两种预测方式:第一种是反量化预测方式,即首先将INT8/16类型的权重反量化成FP32类型,然后再使用FP32浮点运算进行预测;第二种是量化预测方式,即在预测中动态计算量化OP输入的量化信息,基于量化的输入和权重进行INT8整型运算。
+
+注意,目前PaddleLite仅仅支持第一种反量化预测方式。
 
 使用条件:
 * 有训练好的预测模型
 
 使用步骤:
-* 产出量化模型:使用PaddlePaddle调用无校准数据训练后量化接口,产出量化模型
+* 产出量化模型:使用PaddlePaddle调用动态离线量化接口,产出量化模型
 * 量化模型预测:使用PaddleLite加载量化模型进行预测推理
 
 优点:
@@ -18,11 +22,11 @@
 * 权重量化成INT8类型,模型精度会受到影响,模型大小为原始的1/4
 
 缺点:
-* 只可以减小模型大小,不能加快模型推理
+* 目前只支持反量化预测方式,主要可以减小模型大小,对特定加载权重费时的模型可以起到一定加速效果
 
 ## 2 产出量化模型
 
-因为目前该方法还没有在PaddleSlim中集成,大家可以使用PaddlePaddle调用无校准数据训练后量化接口,得到量化模型。
+目前该方法还没有在PaddleSlim中集成,大家可以使用PaddlePaddle调用动态离线量化接口,得到量化模型。
 
 ### 2.1 安装PaddlePaddle
 
@@ -32,9 +36,9 @@
 准备已经训练好的FP32预测模型,即 `save_inference_model()` 保存的模型。
 
-### 2.3 调用无校准数据训练后量化
+### 2.3 调用动态离线量化
 
-对于调用无校准数据训练后量化,首先给出一个例子。
+对于调用动态离线量化,首先给出一个例子。
 
 ```python
 from paddle.fluid.contrib.slim.quantization import WeightQuantization
@@ -52,7 +56,7 @@ weight_quant.quantize_weight_to_int(save_model_dir=save_model_dir,
 
 执行完成后,可以在 `save_model_dir/quantized_model` 目录下得到量化模型。
 
-对于调用无校准数据训练后量化,以下对api接口进行详细介绍。
+对于调用动态离线量化,以下对api接口进行详细介绍。
 
 ```python
 class WeightQuantization(model_dir, model_filename=None, params_filename=None)
@@ -85,11 +89,11 @@ WeightQuantization.quantize_weight_to_int(self,
 
 ## 3 量化模型预测
 
-目前,对于无校准数据训练后量化产出的量化模型,只能使用PaddleLite进行预测部署。
+目前,对于动态离线量化产出的量化模型,只能使用PaddleLite进行预测部署。
 
 很简单,首先使用PaddleLite提供的模型转换工具(opt)将量化模型转换成移动端预测的模型,然后加载转换后的模型进行预测部署。
 
-注意,PaddleLite 2.3版本才支持无校准数据训练后量化产出的量化,所以转换工具和预测库必须是2.3及之后的版本。
+注意,PaddleLite 2.3版本才支持动态离线量化产出的量化模型,所以转换工具和预测库必须是2.3及之后的版本。
 
 ### 3.1 模型转换
diff --git a/docs/user_guides/post_quant_with_data.md b/docs/user_guides/post_quant_with_data.md
index 11b33c06e31f7f6ab63970ef307d7741888445e3..a861a9e95aa2dc79573d79037695d4864bb3a7ba 100644
--- a/docs/user_guides/post_quant_with_data.md
+++ b/docs/user_guides/post_quant_with_data.md
@@ -1,17 +1,17 @@
-# 模型量化-有校准数据训练后量化
+# 模型量化-静态离线量化
 
 ## 1 简介
 
-有校准数据训练后量化,使用少量校准数据计算量化因子,可以快速得到量化模型。使用该量化模型进行预测,可以减少计算量、降低计算内存、减小模型大小。
+静态离线量化,使用少量校准数据计算量化因子,可以快速得到量化模型。使用该量化模型进行预测,可以减少计算量、降低计算内存、减小模型大小。
 
-有校准数据训练后量化中,有两种计算量化因子的方法,非饱和量化方法和饱和量化方法。非饱和量化方法计算整个Tensor的绝对值最大值`abs_max`,将其映射为127。饱和量化方法使用KL散度计算一个合适的阈值`T` (`0
-#include
-#include
+#include  // NOLINT
+#include  // NOLINT
 
 #include "lite/api/light_api.h"
 #include "lite/api/paddle_api.h"
@@ -78,6 +78,14 @@ inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env,
   return result;
 }
 
+inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env,
+                                          const int8_t *buf,
+                                          int64_t len) {
+  jbyteArray result = env->NewByteArray(len);
+  env->SetByteArrayRegion(result, 0, len, buf);
+  return result;
+}
+
 inline jintArray cpp_array_to_jintarray(JNIEnv *env,
                                         const int *buf,
                                         int64_t len) {
@@ -86,11 +94,11 @@ inline jintArray cpp_array_to_jintarray(JNIEnv *env,
   return result;
 }
 
-inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env,
-                                          const int8_t *buf,
+inline jlongArray cpp_array_to_jlongarray(JNIEnv *env,
+                                          const int64_t *buf,
                                           int64_t len) {
-  jbyteArray result = env->NewByteArray(len);
-  env->SetByteArrayRegion(result, 0, len, buf);
+  jlongArray result = env->NewLongArray(len);
+  env->SetLongArrayRegion(result, 0, len, buf);
   return result;
 }
diff --git a/lite/api/android/jni/native/tensor_jni.cc b/lite/api/android/jni/native/tensor_jni.cc
index 5212fe9a6eba2b034883da93c9ea5d845a63c773..94e57393a99e3e16a2bc60834ab55e6fc2824db7 100644
--- a/lite/api/android/jni/native/tensor_jni.cc
+++ b/lite/api/android/jni/native/tensor_jni.cc
@@ -136,6 +136,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
   return JNI_TRUE;
 }
 
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3J(
+    JNIEnv *env, jobject jtensor, jlongArray buf) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  int64_t buf_size = (int64_t)env->GetArrayLength(buf);
+  if (buf_size != product((*tensor)->shape())) {
+    return JNI_FALSE;
+  }
+
+  int64_t *input = (*tensor)->mutable_data<int64_t>();
+  env->GetLongArrayRegion(buf, 0, buf_size, input);
+  return JNI_TRUE;
+}
+
 JNIEXPORT jfloatArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) {
   if (is_const_tensor(env, jtensor)) {
@@ -178,6 +194,20 @@ Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *env, jobject jtensor) {
   }
 }
 
+JNIEXPORT jlongArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getLongData(JNIEnv *env, jobject jtensor) {
+  if (is_const_tensor(env, jtensor)) {
+    std::unique_ptr<const Tensor> *tensor =
+        get_read_only_tensor_pointer(env, jtensor);
+    return cpp_array_to_jlongarray(
+        env, (*tensor)->data<int64_t>(), product((*tensor)->shape()));
+  } else {
+    std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+    return cpp_array_to_jlongarray(
+        env, (*tensor)->data<int64_t>(), product((*tensor)->shape()));
+  }
+}
+
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(
     JNIEnv *env, jobject jtensor, jlong java_pointer) {
   if (java_pointer == 0) {
diff --git a/lite/api/android/jni/native/tensor_jni.h b/lite/api/android/jni/native/tensor_jni.h
index 9b029dfb4c7431354d5de20c6132236764c6cc66..c98171918b4040065fa637846c514e2232af6d38 100644
--- a/lite/api/android/jni/native/tensor_jni.h
+++ b/lite/api/android/jni/native/tensor_jni.h
@@ -57,6 +57,14 @@ Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject);
 JNIEXPORT jintArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *, jobject);
 
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    getLongData
+ * Signature: ()[J
+ */
+JNIEXPORT jlongArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getLongData(JNIEnv *, jobject);
+
 /*
  * Class:     com_baidu_paddle_lite_Tensor
  * Method:    nativeResize
@@ -89,6 +97,14 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
     JNIEnv *, jobject, jintArray);
 
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    nativeSetData
+ * Signature: ([J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3J(
+    JNIEnv *, jobject, jlongArray);
+
 /*
  * Class:     com_baidu_paddle_lite_Tensor
  * Method:    deleteCppTensor
diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
index f76841dd413ddda86678eecf8241068dd98b74a4..c4e75993c537c1b14206b7be87c12d7109c8adeb 100644
--- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
@@ -141,6 +141,11 @@ public class Tensor {
      */
     public native int[] getIntData();
 
+    /**
+     * @return the tensor data as long array.
+     */
+    public native long[] getLongData();
+
     private native boolean nativeResize(long[] dims);
 
     private native boolean nativeSetData(float[] buf);
@@ -149,6 +154,8 @@ public class Tensor {
 
     private native boolean nativeSetData(int[] buf);
 
+    private native boolean nativeSetData(long[] buf);
+
     /**
      * Delete C++ Tenor object pointed by the input pointer, which is presented by a
      * long value.
diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc
index f61ed9b4c38fcc3a6fe33fd26d6d3a80edcb9373..3cce247750341b37bf9aff07fce8ec54ee1428fe 100644
--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include
 #include
 #include
 #include
@@ -25,6 +24,7 @@
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
+#include
 
 using paddle::lite::profile::Timer;
 
@@ -34,6 +34,10 @@ DEFINE_string(input_shape,
 DEFINE_bool(use_optimize_nb,
             false,
             "optimized & naive buffer model for mobile devices");
+DEFINE_string(backend,
+              "arm_cpu",
+              "choose backend for valid_places: arm_cpu | opencl. Compile "
+              "OpenCL version if you choose opencl");
 DEFINE_string(arg_name, "", "the arg name");
 
 namespace paddle {
 namespace lite {
 void OutputOptModel(const std::string& load_model_dir,
@@ -49,9 +53,19 @@
         Place{TARGET(kX86), PRECISION(kInt64)},
         Place{TARGET(kHost), PRECISION(kFloat)}});
 #else
-  config.set_valid_places({
-      Place{TARGET(kARM), PRECISION(kFloat)},
-  });
+  if (FLAGS_backend == "opencl") {
+    config.set_valid_places({
+        Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+        Place{TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+        TARGET(kARM),  // enable kARM CPU kernel when no opencl kernel
+    });
+  } else {  // arm_cpu
+    config.set_valid_places({
+        Place{TARGET(kARM), PRECISION(kFloat)},
+    });
+  }
 #endif
 
   auto predictor = lite_api::CreatePaddlePredictor(config);
@@ -117,16 +131,40 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
             << ", min time: " << ti.LapTimes().Min() << " ms"
             << ", max time: " << ti.LapTimes().Max() << " ms.";
 
-  auto output = predictor->GetOutput(0);
-  auto out = output->data<float>();
-  LOG(INFO) << "out " << out[0];
-  LOG(INFO) << "out " << out[1];
-  auto output_shape = output->shape();
-  int output_num = 1;
-  for (int i = 0; i < output_shape.size(); ++i) {
-    output_num *= output_shape[i];
+  // output summary
+  size_t output_tensor_num = predictor->GetOutputNames().size();
+  LOG(INFO) << "output tensor num:" << output_tensor_num;
+
+  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
+    auto output_tensor = predictor->GetOutput(tidx);
+    LOG(INFO) << "============= output tensor " << tidx << " =============";
+    auto tensor_shape = output_tensor->shape();
+    std::string tensor_shape_str{""};
+    int output_tensor_numel = 1;
+    for (int i = 0; i < tensor_shape.size(); ++i) {
+      output_tensor_numel *= tensor_shape[i];
+      tensor_shape_str += std::to_string(tensor_shape[i]);
+      tensor_shape_str += (i < tensor_shape.size() - 1) ? "x" : "";
+    }
+    auto out_data = output_tensor->data<float>();
+    auto out_mean =
+        paddle::lite::compute_mean(out_data, output_tensor_numel);
+    auto out_std_dev = paddle::lite::compute_standard_deviation(
+        out_data, output_tensor_numel, true, out_mean);
+
+    LOG(INFO) << "output tensor " << tidx << " dims:" << tensor_shape_str;
+    LOG(INFO) << "output tensor " << tidx
+              << " elements num:" << output_tensor_numel;
+    LOG(INFO) << "output tensor " << tidx
+              << " standard deviation:" << out_std_dev;
+    LOG(INFO) << "output tensor " << tidx << " mean value:" << out_mean << "\n";
+
+    // print result
+    for (int i = 0; i < output_tensor_numel; ++i) {
+      VLOG(2) << "output_tensor->data<float>()[" << i
+              << "]:" << output_tensor->data<float>()[i];
+    }
   }
-  LOG(INFO) << "output_num: " << output_num;
 
   // please turn off memory_optimize_pass to use this feature.
 if (FLAGS_arg_name != "") {
@@ -162,6 +200,7 @@ int main(int argc, char** argv) {
               << "--model_dir /path/to/your/model";
     exit(0);
   }
+
   std::string save_optimized_model_dir = "";
   if (FLAGS_use_optimize_nb) {
     save_optimized_model_dir = FLAGS_model_dir;
diff --git a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
index 360d8c753ef64b1da2ff2aeebddd94ff0f41db96..296eddffe762d4f88fb0df2731ef93f02bde9fb3 100644
--- a/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/grid_sampler_kernel.cl
@@ -63,7 +63,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
   if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
     input3 = (CL_DTYPE4)(0.0);
   }
-  CL_DTYPE4 out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  CL_DTYPE4 out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+                      input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+                      input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+                      input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, outpoints, out_val);
 
   // y
@@ -97,7 +100,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
     input3 = (CL_DTYPE4)(0.0);
   }
-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 1), out_val);
 
   // z
@@ -130,7 +136,10 @@ __kernel void grid_sampler(__read_only image2d_t input,
   if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
     input3 = (CL_DTYPE4)(0.0);
   }
-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 2), out_val);
 
   // w
@@ -163,6 +172,9 @@ __kernel void grid_sampler(__read_only image2d_t input,
   if (x0 + 1 < 0 || x0 + 1 > out_width - 1 || y0 + 1 < 0 || y0 + 1 > out_height - 1){
     input3 = (CL_DTYPE4)(0.0);
   }
-  out_val = input0 * xe * ye + input1 * xs * ye + input2 * xe * ys + input3 * xs * ys;
+  out_val = input0 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ye) +
+            input1 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ye) +
+            input2 * (CL_DTYPE4)(xe) * (CL_DTYPE4)(ys) +
+            input3 * (CL_DTYPE4)(xs) * (CL_DTYPE4)(ys);
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(outpoints.x, outpoints.y + 3), out_val);
 }
diff --git a/lite/utils/io.h b/lite/utils/io.h
index 506901bad5f75c5c1564f6340c7f687537de2e68..2141364df79bb189772592a556dd9a115ae1a67e 100644
--- a/lite/utils/io.h
+++ b/lite/utils/io.h
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "lite/utils/cp_logging.h"