Commit ef2616b6 authored by Liu Yiqun

Merge branch 'develop' into step_rnn/opt_ddim_lite

@@ -35,6 +35,7 @@
build/
build_fpga/
docs/_build/
.idea/
......
Please refer to the [PaddleLite documentation style guide](http://agroup.baidu.com/paddle-infer/md/article/2561104).
# How to add a new OP
The following walks through adding a new op, using argmax as the example.
## 1. Add an OpParam struct to carry the op's inputs and outputs
- Here it is named `ArgmaxParam`.
- Add the `ArgmaxParam` struct to `paddlelite/lite/operators/op_params.h`:
```c++
struct ArgmaxParam {
  lite::Tensor* X{};    // input tensor
  lite::Tensor* Out{};  // output tensor
  int Axis{0};          // axis along which the argmax is taken
};
```
## 2. Add and register the Argmax op
- Create argmax_op.h under paddlelite/lite/operators/; the core code is:
```c++
class ArgmaxOpLite : public OpLite {
 public:
  ArgmaxOpLite() {}
  explicit ArgmaxOpLite(const std::string &op_type) : OpLite(op_type) {}
  bool CheckShape() const override;
  bool InferShape() const override;
  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
  std::string DebugString() const override { return "argmax"; }

 private:
  mutable ArgmaxParam param_;
};
```
`ArgmaxOpLite` inherits from `OpLite` and holds an `ArgmaxParam` struct as a member. The interfaces to implement are `CheckShape()`, `InferShape()`, `AttachImpl()`, `AttachKernel()` and `DebugString()`. `AttachKernel()` and `DebugString()` are simple, so they are implemented inline here.
- Create argmax_op.cc under the `paddlelite/lite/operators/` directory and implement `CheckShape()`, `InferShape()` and `AttachImpl()`. `CheckShape()` checks that the inputs are valid, `InferShape()` derives the output dimensions from the inputs, and `AttachImpl()` binds the op's inputs and outputs. Then register argmax in argmax_op.cc; the core code is:
```c++
bool ArgmaxOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.X);
  CHECK_OR_FALSE(param_.Out);
  CHECK_OR_FALSE(param_.Axis < (param_.X)->dims().size());
  return true;
}

bool ArgmaxOpLite::InferShape() const {
  auto x_dims = param_.X->dims();
  int x_rank = x_dims.size();
  int axis = param_.Axis;
  if (axis < 0) axis += x_rank;

  // The output keeps every input dimension except the one along `axis`.
  std::vector<int64_t> out_dims;
  for (int64_t i = 0; i < axis; i++) {
    out_dims.push_back(x_dims[i]);
  }
  for (int64_t i = axis + 1; i < x_rank; i++) {
    out_dims.push_back(x_dims[i]);
  }

  // Set output dims
  param_.Out->Resize(lite::DDim(out_dims));
  return true;
}

bool ArgmaxOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
  auto x = op_desc.Input("X").front();
  auto out = op_desc.Output("Out").front();

  param_.X = scope->FindVar(x)->GetMutable<lite::Tensor>();
  param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
  param_.Axis = op_desc.GetAttr<int>("Axis");
  return true;
}

REGISTER_LITE_OP(argmax, paddle::lite::operators::ArgmaxOpLite);
```
- In paddlelite/lite/operators/CMakeLists.txt, add ```add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS})```
## 3. Add the Argmax kernel and bind it
The ARM implementation of argmax is used as the example below.
- Create argmax_compute.h under paddlelite/lite/kernels/arm/, declare the ArgmaxCompute class, and inherit from KernelLite; the core code is:
```c++
class ArgmaxCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
 public:
  using param_t = operators::ArgmaxParam;
  void Run() override;
  virtual ~ArgmaxCompute() = default;
};
```
- Create argmax_compute.cc under paddlelite/lite/kernels/arm/ and implement the Run function. `Run()` calls `argmax_func()` from paddlelite/lite/backends/arm/math/argmax.h to compute the output from the input. Finally, bind argmax's inputs and outputs in argmax_compute.cc (every input parameter that is a tensor must be bound):
```c++
void ArgmaxCompute::Run() {
  auto& param = Param<operators::ArgmaxParam>();
  lite::Tensor* input = param.X;
  lite::Tensor* output = param.Out;
  int axis = param.Axis;
  lite::arm::math::argmax_func(input, axis, output);
  return;
}

REGISTER_LITE_KERNEL(
    argmax, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ArgmaxCompute, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
```
- In paddlelite/lite/kernels/arm/CMakeLists.txt, add
```cmake
add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
```
## 4. Add the Argmax implementation
- Create argmax.h under paddlelite/lite/backends/arm/math/ and declare `argmax_func()`:
```c++
void argmax_func(const lite::Tensor* input, const int axis, lite::Tensor* output);
```
- Create argmax.cc under paddlelite/lite/backends/arm/math/ and implement `argmax_func()`:
```c++
void argmax_func(const lite::Tensor *input,
                 const int axis,
                 lite::Tensor *output) {
  auto input_ddim = input->dims();
  auto output_ddim = output->dims();
  const int size = input_ddim[axis];
  const int in_channel = input_ddim.count(axis, input_ddim.size());
  const int out_channel = output_ddim.count(axis, output_ddim.size());
  const int in_stride = input_ddim.count(axis + 1, input_ddim.size());
  const int out_stride = input_ddim.count(0, axis);

  for (int n = 0; n < out_stride; n++) {
    for (int k = 0; k < in_stride; k++) {
      const float *in_ptr = input->data<float>() + n * in_channel + k;
      std::vector<std::pair<float, int>> vec;
      vec.resize(size);
      for (int i = 0; i < size; i++) {
        vec[i] = std::make_pair(in_ptr[i * in_stride], i);
      }
      // Partially sort so the largest (value, index) pair comes first.
      std::partial_sort(vec.begin(),
                        vec.begin() + 1,
                        vec.end(),
                        std::greater<std::pair<float, int>>());
      // Write the index of the maximum to the output.
      float *out_ptr = output->mutable_data<float>() + n * out_channel + k;
      *out_ptr = vec[0].second;
    }
  }
}
```
- Add argmax.cc to the ```math_arm library``` in paddlelite/lite/backends/arm/math/CMakeLists.txt, and add ```#include "lite/backends/arm/math/argmax.h"``` to paddlelite/lite/backends/arm/math/funcs.h.
## 5. Add an Argmax unit test
- Create argmax_compute_test.cc under paddlelite/lite/tests/kernels, and declare and implement the ArgmaxComputeTester class;
- ArgmaxComputeTester mainly consists of the PrepareOpDesc, PrepareData and RunBaseline functions. PrepareOpDesc sets the op type and input/output parameters for the test, PrepareData initializes the input tensors, and RunBaseline computes a reference output from the inputs to compare against the framework's output; a minimal sketch is shown below.
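For orientation, here is a minimal sketch of such a tester built on `arena::TestCase` (the member names, test data and the abridged baseline body are illustrative; see the real test under paddlelite/lite/tests/kernels for the full version):
```c++
class ArgmaxComputeTester : public arena::TestCase {
 protected:
  std::string input_ = "X";
  std::string output_ = "Out";
  int axis_ = 0;
  DDim dims_;

 public:
  ArgmaxComputeTester(const Place& place, const std::string& alias,
                      int axis, int n, int c, int h, int w)
      : TestCase(place, alias),
        axis_(axis),
        dims_(std::vector<int64_t>({n, c, h, w})) {}

  // Describe the op under test: type, inputs, outputs and attributes.
  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType("argmax");
    op_desc->SetInput("X", {input_});
    op_desc->SetOutput("Out", {output_});
    op_desc->SetAttr("Axis", axis_);
  }

  // Initialize the input tensor with deterministic test data.
  void PrepareData() override {
    std::vector<float> data(dims_.production());
    for (int64_t i = 0; i < dims_.production(); ++i) data[i] = i * 1.1f;
    SetCommonTensor(input_, dims_, data.data());
  }

  // Reference computation that the kernel output is compared against.
  void RunBaseline(Scope* scope) override {
    auto* out = scope->NewTensor(output_);
    // The output shape is the input shape with dimension `axis_` removed.
    std::vector<int64_t> out_shape;
    for (size_t i = 0; i < dims_.size(); ++i) {
      if (static_cast<int>(i) != axis_) out_shape.push_back(dims_[i]);
    }
    out->Resize(DDim(out_shape));
    // ... fill `out` with the index of the max value along `axis_` ...
  }
};
```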
- Add the unit test with gtest:
```c++
TEST(Argmax, precision) {
#ifdef LITE_WITH_ARM
LOG(INFO) << "test argmax arm";
Place place(TARGET(kARM));
for (int axis : {0, 1, 2, 3}) {
for (int n : {1, 3}) {
for (int c : {3, 6}) {
for (int h : {9, 18}) {
for (int w : {9, 18}) {
std::unique_ptr<arena::TestCase> tester(
new ArgmaxComputeTester(place, "def", axis, n, c, h, w));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
}
}
}
}
}
#endif
}
```
- In paddlelite/lite/tests/kernels/CMakeLists.txt, add
```cmake
lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
```
## 6. Build and run
- From the paddlelite directory, run ```./lite/tools/ci_build.sh build_test_arm```. The script creates a phone emulator and builds and runs all unit tests (this takes a long time). If everything runs cleanly, argmax has been added successfully.
# Model quantization
This document describes how to load a quantized model produced by PaddlePaddle into Paddle-Lite and run inference with it. MobileNetV1 is used as the example: we first cover preparing the quantized model, then deployment and execution.
## Preparing the quantized model
PaddlePaddle quantizes FP32 models to Int8 with two methods, quantization-aware training and post-training quantization. The following describes how each method produces a quantized model.
### Quantization-aware training
Currently, PaddlePaddle's quantization-aware training mainly targets convolution layers (2D and depthwise convolutions) and fully connected layers, i.e. the conv2d, depthwise_conv2d and mul operators; see the [documentation](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/docs/tutorial.md#1-quantization-aware-training%E9%87%8F%E5%8C%96%E4%BB%8B%E7%BB%8D) for more on how it works. Paddle-Lite can run models produced by PaddlePaddle's quantization-aware training, further speeding up model execution on mobile devices.
Tip: if this is your first contact with the PaddlePaddle framework, start with the [beginner's guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html) and the [user guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/index_cn.html).
You can either download a trained quantized model or train one with the PaddleSlim model compression toolkit.
#### Download a quantized model
An official [MobileNetV1 quantized model](https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip) is available for direct download:
```bash
wget https://paddle-inference-dist.bj.bcebos.com/int8%2Fpretrain%2Fmobilenet_v1_quant%2Ffloat.zip
```
#### Train a quantized model with the PaddleSlim toolkit
##### Install PaddlePaddle
Install PaddlePaddle per the [official instructions](https://paddlepaddle.org.cn/start) for your OS, installation method, Python version and CUDA version. For example:
GPU install on Ubuntu 16.04.4 LTS with CUDA 9 and cuDNN 7:
```bash
pip install paddlepaddle-gpu==1.6.0.post97 -i https://mirrors.aliyun.com/pypi/simple/
```
CPU install on Ubuntu 16.04.4 LTS:
```bash
pip install paddlepaddle==1.6.0 -i https://mirrors.aliyun.com/pypi/simple/
```
##### Clone the code needed for quantization training
Clone [PaddlePaddle/models](https://github.com/PaddlePaddle/models) and enter the models/PaddleSlim directory:
```bash
git clone https://github.com/PaddlePaddle/models.git
cd models/PaddleSlim
```
##### Prepare the data
###### Training data
Follow the data preparation tutorial in [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification#data-preparation) to download the training data and save it under PaddleSlim/data.
###### Pretrained model
Following the /models/PaddleSlim/run.sh script, download the MobileNetV1 pretrained model from [models/PaddleCV/image_classification](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#supported-models-and-performances) and save it under PaddleSlim/pretrain.
After the steps above, the PaddleSlim directory layout looks like:
```bash
.
├── compress.py          # main script for compression tasks; defines the model-side information a task needs
├── configs              # config files for compression tasks: distillation, int8 quantization, filter pruning and combined strategies
├── data                 # training data (created by the user)
│   └── ILSVRC2012
├── pretrain             # pretrained model parameters, generated by running run.sh
│   ├── MobileNetV1_pretrained
│   ├── MobileNetV1_pretrained.tar
│   ├── ResNet50_pretrained
│   └── ResNet50_pretrained.tar
├── docs                 # documentation
├── light_nas
├── models               # network definitions, e.g. MobileNetV1
├── quant_low_level_api  # low-level quantization-training APIs, for advanced users who need to customize the process
├── reader.py            # data-processing logic
├── README.md
├── run.sh               # launch script for compression tasks
└── utility.py           # common utility functions
```
##### The compression script
`compress.py` defines all of the model-side information that a compression task needs. The key steps are briefly introduced below:
###### Defining the target network
The following snippet from compress.py defines the train program; here the train program contains only forward computation:
```python
out = model.net(input=image, class_dim=args.class_dim)
cost = fluid.layers.cross_entropy(input=out, label=label)
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
```
Then eval_program is obtained via the clone method, to evaluate model accuracy during compression:
```python
val_program = fluid.default_main_program().clone()
```
After the target network is defined, initialize it and load a pretrained model as needed; a minimal sketch follows.
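For illustration, initialization and pretrained-model loading in fluid 1.x typically look like the sketch below (the paths and the `args` object are assumptions; compress.py's actual logic may differ in details):
```python
import os
import paddle.fluid as fluid

place = fluid.CUDAPlace(0)  # or fluid.CPUPlace() for CPU training
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())  # initialize all parameters

if args.pretrained_model:  # load only the variables the pretrain dir provides
    def if_exist(var):
        return os.path.exists(os.path.join(args.pretrained_model, var.name))
    fluid.io.load_vars(exe, args.pretrained_model, predicate=if_exist)
```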
###### Defining feed_list and fetch_list
For the train program, train_feed_list specifies which variables receive the data taken from the train data reader, and train_fetch_list specifies which results are shown in the log during training. To print accuracy information in the training log, just add ('acc_top1', acc_top1.name) to train_fetch_list.
```python
train_feed_list = [('image', image.name), ('label', label.name)]
train_fetch_list = [('loss', avg_cost.name)]
```
> Note: train_fetch_list must contain a loss entry.

For the eval program, define val_feed_list and val_fetch_list in the same way:
```python
val_feed_list = [('image', image.name), ('label', label.name)]
val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5', acc_top5.name)]
```
###### Compressor and the quantization config file
`compress.py` drives quantization training mainly through the Compressor class and a yaml file. The Compressor class is defined as:
```python
class Compressor(object):
def __init__(self,
place,
scope,
train_program,
train_reader=None,
train_feed_list=None,
train_fetch_list=None,
eval_program=None,
eval_reader=None,
eval_feed_list=None,
eval_fetch_list=None,
teacher_programs=[],
checkpoint_path='./checkpoints',
train_optimizer=None,
distiller_optimizer=None):
```
When constructing a Compressor object, note the following:
* If the train program contains backward operators and optimizer-update operators, the train_optimizer argument must be set to None.
* Parameter names in eval_program must match the parameter names in train_program exactly.
* The final quantized model is saved by pruning the eval_program network, so if you want the saved model to be usable for inference, the eval program must contain all operators the inference stage needs.
* Checkpoints store the model in float data type.

An example `configs/quantization.yaml` quantization config file:
```yaml
version: 1.0
strategies:
quantization_strategy:
class: 'QuantizationStrategy'
start_epoch: 0
end_epoch: 9
float_model_save_path: './output/float'
mobile_model_save_path: './output/mobile'
int8_model_save_path: './output/int8'
weight_bits: 8
activation_bits: 8
weight_quantize_type: 'abs_max'
activation_quantize_type: 'moving_average_abs_max'
save_in_nodes: ['image']
save_out_nodes: ['fc_0.tmp_2']
compressor:
epoch: 10
checkpoint_path: './checkpoints_quan/'
strategies:
- quantization_strategy
```
The configurable parameters are:
- **class:** name of the quantization strategy class; currently only `QuantizationStrategy` is supported.
- **start_epoch:** before start_epoch begins, the quantization training strategy inserts quantize and dequantize operators into train_program and eval_program. Quantization training starts from start_epoch.
- **end_epoch:** after end_epoch finishes, models are saved in the formats you specified. Note that quantization training does not stop after end_epoch; it continues until the epoch count reaches compressor.epoch. For example, with start_epoch=0, end_epoch=0 and compressor.epoch=2, quantization training starts at epoch 0 and finishes at epoch 1, but the saved model holds the parameter state at the end of epoch 0.
- **float_model_save_path:** path for saving the float-format model, i.e. the parameter values under this path are in the int8 range but the parameter data type is float32. If set to None, no float-format model is saved; defaults to None. **Note: Paddle-Lite uses the model in this directory for quantized-model inference optimization; see the [Running quantized-model inference with Paddle-Lite](#二使用Paddle-Lite运行量化模型推理) section below.**
- **int8_model_save_path:** path for saving the int8-format model, i.e. the parameter values under this path are in the int8 range and the parameter data type is int8. If set to None, no int8-format model is saved; defaults to None.
- **mobile_model_save_path:** path for saving a model compatible with the paddle-mobile framework. If set to None, no paddle-mobile-format model is saved; defaults to None. paddle-mobile has since been upgraded to Paddle-Lite.
- **weight_bits:** number of bits for quantizing weights; note that bias parameters are not quantized.
- **activation_bits:** number of bits for quantizing activations.
- **weight_quantize_type:** weight quantization method; quantization training currently supports `abs_max` and `channel_wise_abs_max`.
- **activation_quantize_type:** activation quantization method; quantization training currently supports `range_abs_max` and `moving_average_abs_max`. PaddlePaddle also supports the `abs_max` method for activations, but it computes the input's quantization scale dynamically, which adds computation and slows down inference, so Lite does not support `abs_max` activation quantization.
- **save_in_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by forward traversal from save_in_nodes. Defaults to the list of variable names specified in eval_feed_list.
- **save_out_nodes:** a list of variable names. When saving the quantized model, the eval program network is pruned by backward traversal from save_out_nodes. Defaults to the list of variable names specified in eval_fetch_list.
> **Notes:**
>
> 1) `abs_max` computes the quantization scale dynamically at every training step and at inference time. `channel_wise_abs_max` is similar, except that it computes a separate scale per channel of the convolution weights. In other words, `abs_max` is tensor-wise quantization while `channel_wise_abs_max` is channel-wise quantization; see [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/quantization/training_quantization_model_format.md) for details.
>
> 2) `moving_average_abs_max` and `range_abs_max` compute a static quantization scale during training and use it at inference time. `moving_average_abs_max` computes the scale with a moving average over a window, while `range_abs_max` uses the absolute maximum within a window.
>
> 3) **Currently, Paddle-Lite only runs quantized models produced with `abs_max` weight quantization and `moving_average_abs_max` or `range_abs_max` activation quantization.**
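To make the scale computations in the notes concrete, here is a hedged sketch in informal notation (the notation is mine, not from the PaddlePaddle docs, and the actual implementations differ in bookkeeping details):
```latex
% abs_max (dynamic, recomputed at every step and at inference):
%   s_t = \max_i \lvert x_i^{(t)} \rvert
% range_abs_max (maximum over a sliding window of recent steps):
%   s_t = \max_{t-W < \tau \le t} \max_i \lvert x_i^{(\tau)} \rvert
% moving_average_abs_max (exponential moving average with decay \rho):
%   s_t = \rho\, s_{t-1} + (1-\rho) \max_i \lvert x_i^{(t)} \rvert
% k-bit quantization of x_i with scale s:
%   q_i = \mathrm{round}\!\left( x_i \cdot (2^{k-1}-1) / s \right)
```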
##### Run int8 quantization training
Edit run.sh: comment out everything between `# enable GC strategy` and `# for sensitivity filter pruning`, and uncomment the `# for quantization` commands (the commands to uncomment are shown below).
```bash
# for quantization
#---------------------------
export CUDA_VISIBLE_DEVICES=0
python compress.py \
--batch_size 64 \
--model "MobileNet" \
--pretrained_model ./pretrain/MobileNetV1_pretrained \
--compress_config ./configs/quantization.yaml \
--quant_only True
```
Finally, run `sh run.sh` to start int8 quantization training.
After the quantization training completes, if the model output paths are configured as in the `configs/quantization.yaml` shown above, the models/PaddleSlim/output directory contains three subdirectories, `float`, `int8` and `mobile`:
* float: a quantized model whose parameter values are in the int8 range but whose parameter data type is float32. Paddle-Lite uses the model files and parameters in this directory to deploy quantized models.
* int8: a quantized model whose parameter values are in the int8 range and whose parameter data type is int8.
* mobile: same parameter characteristics as int8, but compatible with paddle-mobile (paddle-mobile has since been upgraded to Paddle-Lite).
### Post-training quantization
The following uses MobileNetV1 as an example to show how post-training quantization produces a quantized model. For the theory and detailed usage of post-training quantization, see the [documentation](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api).
> The example code lives under [models/PaddleSlim/quant_low_level_api/](https://github.com/PaddlePaddle/models/tree/develop/PaddleSlim/quant_low_level_api). To run it, first clone [models](https://github.com/PaddlePaddle/models.git) and install a PaddlePaddle build with post-training quantization support. Because Lite currently supports quantizing conv2d, depthwise_conv2d and mul, edit the [run_post_training_quanzation.sh](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/run_post_training_quanzation.sh) script to set is_full_quantize=False, then run it; when it finishes, the quantized model is saved under the `mobilenetv1_int8_model` directory. The detailed steps follow.

1) **Prepare the model and calibration data**
Install the whl package built from PaddlePaddle's develop branch, and prepare a trained FP32 inference model.
Prepare the calibration data with the layout below; the val folder contains 100 images, and the val_list.txt file holds each image's label.
```bash
samples_100
└──val
└──val_list.txt
```
2) **Configure the calibration data generator**
MobileNetV1 takes an image and a label as input, so configure a sample_generator that reads the calibration data and returns one image and one label at a time. The full code is in [models/PaddleSlim/reader.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/reader.py); a rough sketch follows.
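For illustration, a calibration sample generator might look like the sketch below (the file layout matches the tree above, but the preprocessing constants are assumptions; the real logic lives in reader.py):
```python
import os
import numpy as np
from PIL import Image

def val(data_dir):
    """Return a sample_generator yielding (image, label) pairs."""
    def sample_generator():
        with open(os.path.join(data_dir, 'val_list.txt')) as f:
            for line in f:
                img_path, label = line.strip().split()
                img = Image.open(os.path.join(data_dir, img_path)).convert('RGB')
                img = img.resize((224, 224))
                # HWC uint8 -> CHW float32, scaled to [0, 1]
                # (normalization constants are illustrative).
                img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255.0
                yield img, int(label)
    return sample_generator
```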
3) **Invoke post-training quantization**
The core code for invoking post-training quantization is below; the full code is in [post_training_quantization.py](https://github.com/PaddlePaddle/models/blob/develop/PaddleSlim/quant_low_level_api/post_training_quantization.py).
``` python
place = fluid.CUDAPlace(0) if args.use_gpu == "True" else fluid.CPUPlace()
exe = fluid.Executor(place)
sample_generator = reader.val(data_dir=args.data_path)
ptq = PostTrainingQuantization(
executor=exe,
sample_generator=sample_generator,
model_dir=args.model_dir,
model_filename=args.model_filename,
params_filename=args.params_filename,
batch_size=args.batch_size,
batch_nums=args.batch_nums,
algo=args.algo,
is_full_quantize=args.is_full_quantize == "True")
quantized_program = ptq.quantize()
ptq.save_quantized_model(args.save_model_path)
```
## Running quantized-model inference with Paddle-Lite
#### Optimize the quantized model with the model optimization tool
Next, use the original quantized model to generate a model suitable for direct deployment on mobile devices.
Set up the build environment per [source compilation](../source_compile) and make sure it builds successfully. Following [model conversion](../model_optimize_tool), first build the model_optimize_tool, then run the command below to optimize the quantization-trained model (note: adjust model_file, param_file and optimize_out yourself):
```bash
./model_optimize_tool \
--model_file=mobilenet_v1_quant/float/model \
--param_file=mobilenet_v1_quant/float/weights \
--optimize_out_type=naive_buffer \
--optimize_out=mobilenet_v1_quant_opt \
--valid_targets=arm \
--prefer_int8_kernel=true
```
As noted above, after quantization training the parameters of the model in the float directory are in the int8 range but still stored as float32, which by itself does not compress the model. After optimization with model\_optimize\_tool, however, the quantized parameters are re-stored as int8, achieving the compression, and the model structure is also optimized (e.g. various operator fuse passes are applied).
#### Push the quantized model files to the phone
Push the quantized model files under mobilenet_v1_quant_opt to the phone:
```bash
adb push mobilenet_v1_quant_opt /data/local/tmp
```
#### Run the optimized quantized model with mobilenetv1\_light\_api
After setting up the build environment per [source compilation](../source_compile), run the following in Paddle-Lite to build the light-API demo:
```bash
cd /Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light
make clean && make -j
```
After the commands above finish, the `mobilenetv1_light_api` executable is available under `Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/`. Push `mobilenetv1_light_api` to the phone and run inference on the quantized model:
```bash
adb push Paddle-Lite/build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/demo/cxx/mobile_light/mobilenetv1_light_api /data/local/tmp
adb shell chmod +x /data/local/tmp/mobilenetv1_light_api
adb shell /data/local/tmp/mobilenetv1_light_api \
--model_dir=/data/local/tmp/mobilenet_v1_quant_opt
```
**The program output looks like this:**
```bash
Output dim: 1000
Output[0]: 0.000228
Output[100]: 0.000260
Output[200]: 0.000250
Output[300]: 0.000560
Output[400]: 0.000950
Output[500]: 0.000275
Output[600]: 0.005143
Output[700]: 0.002509
Output[800]: 0.000538
Output[900]: 0.000969
```
For how to use the Paddle-Lite API from C++, see [here](../cpp_demo); you can also refer to the [mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) code example.
### FAQ
**Q**: Compiled with WITH_GPU, but no GPU found in runtime
**A**: Check whether the machine supports GPU training; if it does not, train on CPU. If training on GPU inside docker, start the container with nvidia_docker.
**Q**: Insufficient GPU memory to allocation. at [/paddle/paddle/fluid/platform/gpu_info.cc:262]
**A**: Set `CUDA_VISIBLE_DEVICES` in run.sh correctly, and make sure the GPU has more free memory than the job needs.
# Supported OP list
## Ops
- affine_channel
- anchor_generator
- arg_max
- assign
- assign_value
- attention_padding_mask
- axpy
- batch_norm
- beam_search
- beam_search_decode
- bilinear_interp
- box_clip
- box_coder
- calib
- calib_once
- cast
- collect_fpn_proposals
- concat
- conditional_block
- conv2d
- conv2d_transpose
- crop
- decode_bboxes
- density_prior_box
- depthwise_conv2d
- distribute_fpn_proposals
- dropout
- elementwise_add
- elementwise_div
- elementwise_max
- elementwise_mul
- elementwise_sub
- equal
- exp
- expand
- fake_channel_wise_dequantize_max_abs
- fake_dequantize_max_abs
- fake_quantize_dequantize_moving_average_abs_max
- fake_quantize_moving_average_abs_max
- fake_quantize_range_abs_max
- fc
- feed
- fetch
- fill_constant
- fill_constant_batch_size_like
- flatten
- flatten2
- floor
- fusion_elementwise_add_activation
- fusion_elementwise_div_activation
- fusion_elementwise_max_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_sub_activation
- gather
- generate_proposals
- graph_op
- greater_equal
- greater_than
- gru
- gru_unit
- hard_sigmoid
- im2sequence
- increment
- instance_norm
- io_copy
- io_copy_once
- is_empty
- layer_norm
- layout
- layout_once
- leaky_relu
- less_equal
- less_than
- lod_reset
- log
- logical_and
- logical_not
- logical_or
- logical_xor
- lookup_table
- lookup_table_v2
- lrn
- match_matrix_tensor
- matmul
- mean
- merge_lod_tensor
- mul
- multiclass_nms
- nearest_interp
- negative
- norm
- notequal
- pad2d
- pool2d
- power
- prelu
- prior_box
- range
- read_from_array
- reduce_max
- reduce_mean
- reduce_prod
- reduce_sum
- relu
- relu6
- relu_clipped
- reshape
- reshape2
- roi_align
- rsqrt
- scale
- search_aligned_mat_mul
- search_attention_padding_mask
- search_fc
- search_grnn
- search_group_padding
- search_seq_arithmetic
- search_seq_depadding
- search_seq_fc
- search_seq_softmax
- sequence_arithmetic
- sequence_concat
- sequence_expand
- sequence_expand_as
- sequence_pool
- sequence_reshape
- sequence_reverse
- sequence_softmax
- sequence_topk_avg_pooling
- shape
- shuffle_channel
- sigmoid
- slice
- softmax
- softsign
- split
- split_lod_tensor
- sqrt
- square
- squeeze
- squeeze2
- stack
- swish
- tanh
- top_k
- transpose
- transpose2
- uniform_random
- unsqueeze
- unsqueeze2
- var_conv_2d
- while
- write_to_array
- yolo_box
## Kernels
### Host kernels
- feed
- fetch
- flatten
- flatten2
- multiclass_nms
- reshape
- reshape2
### ARM kernels
- affine_channel
- anchor_generator
- arg_max
- assign
- assign_value
- axpy
- batch_norm
- beam_search
- beam_search_decode
- bilinear_interp
- box_clip
- box_coder
- cast
- collect_fpn_proposals
- concat
- conditional_block
- conv2d
- conv2d_transpose
- crop
- decode_bboxes
- density_prior_box
- depthwise_conv2d
- distribute_fpn_proposals
- dropout
- elementwise_add
- elementwise_div
- elementwise_max
- elementwise_mul
- elementwise_sub
- equal
- exp
- expand
- fc
- fill_constant
- fill_constant_batch_size_like
- floor
- fusion_elementwise_add_activation
- fusion_elementwise_div_activation
- fusion_elementwise_max_activation
- fusion_elementwise_mul_activation
- fusion_elementwise_sub_activation
- gather
- generate_proposals
- greater_equal
- greater_than
- gru
- gru_unit
- hard_sigmoid
- im2sequence
- increment
- instance_norm
- is_empty
- layer_norm
- layout
- layout_once
- leaky_relu
- less_equal
- less_than
- lod_reset
- log
- logical_and
- logical_not
- logical_or
- logical_xor
- lookup_table
- lookup_table_v2
- lrn
- matmul
- merge_lod_tensor
- mul
- nearest_interp
- negative
- norm
- not_equal
- pad2d
- pool2d
- power
- prelu
- prior_box
- range
- read_from_array
- reduce_max
- reduce_mean
- reduce_prod
- relu
- relu6
- relu_clipped
- roi_align
- rsqrt
- scale
- sequence_expand
- sequence_pool
- sequence_softmax
- shape
- shuffle_channel
- sigmoid
- slice
- softmax
- split
- split_lod_tensor
- squeeze
- squeeze2
- stack
- swish
- tanh
- top_k
- transpose
- transpose2
- unsqueeze
- unsqueeze2
- while
- write_to_array
- yolo_box
### X86 kernels
- batch_norm
- cast
- concat
- conv2d
- depthwise_conv2d
- dropout
- elementwise_add
- elementwise_sub
- fc
- fill_constant_batch_size_like
- gather
- gelu
- gru
- layer_norm
- match_matrix_tensor
- matmul
- mul
- pool2d
- reduce_sum
- relu
- reshape
- reshape2
- scale
- search_aligned_mat_mul
- search_attention_padding_mask
- search_fc
- search_grnn
- search_group_padding
- search_seq_arithmetic
- search_seq_depadding
- search_seq_fc
- search_seq_softmax
- sequence_arithmetic
- sequence_concat
- sequence_expand_as
- sequence_pool
- sequence_reverse
- sequence_topk_avg_pooling
- shape
- slice
- softmax
- softsign
- square
- squeeze
- squeeze2
- stack
- tanh
- transpose
- transpose2
- var_conv_2d
### CUDA kernels
- attention_padding_mask
- bilinear_interp
- calib
- concat
- conv
- dropout
- elementwise_add
- fusion_elementwise_add_activation
- fusion_elementwise_mul_activation
- elementwise_mul
- feed
- io_copy
- layout
- layout_once
- leaky_relu
- lookup_table
- match_matrix_tensor
- mul
- nearest_interp
- pool2d
- relu
- scale
- search_aligned_mat_mul
- search_fc
- search_grnn
- search_group_padding
- search_seq_depadding
- search_seq_fc
- sequence_arithmetic
- sequence_concat
- sequence_pool
- sequence_reverse
- sequence_topk_avg_pooling
- softmax
- transpose
- var_conv_2d
- yolo_box
### OpenCL kernels
- conv2d
- depthwise_conv2d
- elementwise_add
- fc
- fusion_elementwise_add_activation
- layout
- layout_once
- io_copy
- io_copy_once
- mul
- pool2d
- relu
# Benchmark data
See [benchmark_tools](benchmark_tools); the **one-click benchmark** is recommended.
## Test environment
* Test models
  * fp32 models
    * mobilenet_v1
    * mobilenet_v2
    * squeezenet_v1.1
    * mnasnet
    * shufflenet_v2
  * int8 models
    * mobilenet_v1
    * mobilenet_v2
    * resnet50
* Test devices (android ndk ndk-r17c)
  * Snapdragon 855
    * xiaomi mi9, snapdragon 855
    * 4xA76(1@2.84GHz + 3@2.4GHz) + 4xA55@1.78GHz
  * Snapdragon 845
    * xiaomi mi8, 845
    * 2.8GHz (4 big cores), 1.7GHz (4 little cores)
  * Snapdragon 835
    * xiaomi mix2, snapdragon 835
    * 2.45GHz (4 big cores), 1.9GHz (4 little cores)
  * Snapdragon 625
    * oppo R9s, snapdragon 625
    * A53 x 8, big cores @2.0GHz
  * Snapdragon 653
    * 360 N5, snapdragon 653
    * 4 x A73@2.0GHz + 4 x A53@1.4GHz
  * Kirin 970
    * HUAWEI Mate10
* Test notes
  * branch: release/2.0.0
  * warmup=10, repeats=30; the average time is reported, in ms
  * with 1 thread, ```DeviceInfo::Global().SetRunMode``` is set to LITE_POWER_HIGH; otherwise LITE_POWER_NO_BIND (a usage sketch follows this list)
  * the model input dimensions are {1, 3, 224, 224}, and every input value is 1
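For reference, the run-mode setup can be expressed against the light-API config roughly as follows (a hedged sketch; the exact header and setter names should be checked against the Paddle-Lite release in use):
```c++
#include "paddle_api.h"  // light-API config and predictor factory

using namespace paddle::lite_api;

int main() {
  MobileConfig config;
  config.set_model_dir("./mobilenet_v1_opt");  // illustrative model path
  int threads = 1;
  // 1 thread: bind to the big cores at full clock; otherwise leave unbound.
  config.set_power_mode(threads == 1 ? LITE_POWER_HIGH : LITE_POWER_NO_BIND);
  config.set_threads(threads);
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
  return 0;
}
```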
## Test data
### fp32 model results
#### paddlepaddle model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |32.19 |18.81 |10.90 |30.92 |18.31 |10.15
mobilenet_v2 |22.91 |13.75 |8.64 |21.15 |12.79 |7.84
shufflenet_v2 |4.67 |3.37 |2.65 |4.43 |3.15 |2.66
squeezenet_v1.1 |25.10 |15.93 |9.68 |23.28 |14.61 |8.71
mnasnet |21.84 |13.14 |7.96 |19.61 |11.88 |7.55

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |94.13 |52.17 |30.68 |88.28 |47.58 |26.64
mobilenet_v2 |61.24 |34.64 |22.36 |56.66 |32.19 |19.63
shufflenet_v2 |10.87 |6.92 |5.12 |10.41 |6.76 |4.97
squeezenet_v1.1 |73.61 |42.25 |24.44 |64.87 |38.43 |23.06
mnasnet |58.22 |33.43 |20.44 |53.43 |30.20 |18.09

Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |55.11 |28.24 |13.27 |34.24 |17.74 |12.41
mobilenet_v2 |37.03 |19.80 |51.94 |23.64 |12.98 |9.38
shufflenet_v2 |7.26 |4.94 |15.06 |5.32 |3.33 |2.82
squeezenet_v1.1 |42.73 |23.66 |57.39 |26.03 |14.53 |13.66
mnasnet |36.87 |20.15 |46.04 |21.85 |12.06 |8.68

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4
mobilenet_v1 |97.80 |52.64 |34.46 |94.51 |49.36 |28.43
mobilenet_v2 |66.55 |38.52 |23.19 |62.89 |34.93 |21.53
shufflenet_v2 |13.78 |8.11 |5.93 |11.95 |7.90 |5.91
squeezenet_v1.1 |77.64 |43.67 |25.72 |69.91 |40.66 |24.62
mnasnet |61.86 |34.62 |22.68 |59.61 |32.79 |19.56
#### caffe model
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |32.42 |18.68 |10.86 |30.92 |18.35 |10.07 |
mobilenet_v2 |29.53 |17.76 |10.89 |27.19 |16.53 |9.75 |
shufflenet_v2 |4.61 |3.29 |2.61 |4.36 |3.11 |2.51 |

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |92.52 |52.34 |30.37 |88.31 |49.75 |27.29 |
mobilenet_v2 |79.50 |45.67 |28.79 |76.13 |44.01 |26.13 |
shufflenet_v2 |10.94 |7.08 |5.16 |10.64 |6.83 |5.01 |

Kirin 980|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |55.36 |28.18 |13.31 |34.42 |17.93 |12.52 |
mobilenet_v2 |49.17 |26.10 |65.49 |30.50 |16.66 |11.72 |
shufflenet_v2 |8.45 |5.00 |15.65 |4.58 |3.14 |2.83 |

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |97.85 |53.38 |33.85 |94.29 |49.42 |28.29 |
mobilenet_v2 |87.40 |50.25 |31.85 |85.55 |48.11 |28.24 |
shufflenet_v2 |12.16 |8.39 |6.21 |12.21 |8.33 |6.32 |
### int8 quantized model results
Snapdragon 855|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |36.80 |21.58 |11.12 | 14.01 |8.13 |4.32 |
mobilenet_v2 |28.72 |19.08 |12.49 | 17.24 |11.55 |7.82 |

Snapdragon 835|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |60.76 |32.25 |16.66 |56.57 |29.84 |15.24 |
mobilenet_v2 |49.38 |31.10 |22.07 |47.52 |28.18 |19.24 |

Kirin 970|armv7 | armv7 | armv7 |armv8 | armv8 |armv8
----| ---- | ---- | ---- | ---- |---- |----
threads num|1 |2 |4 |1 |2 |4 |
mobilenet_v1 |65.95 |34.39 |18.68 |60.86 |30.98 |16.31 |
mobilenet_v2 |68.87 |39.39 |24.43 |65.57 |37.31 |20.87 |
# Benchmark method
This document describes how to benchmark Paddle-Lite on an Android phone from a terminal, in an **Ubuntu 16.04 cross-compilation environment**, and covers two approaches:
1. **One-click benchmark**: for users who want quick numbers on common models, using pre-built benchmark executables;
2. **Step-by-step benchmark**: the one-click flow broken down step by step.
## Environment setup
1. Install [adb](https://developer.android.com/studio/command-line/adb) and other required tools:
```shell
sudo apt update
sudo apt install -y wget adb
```
2. Check the phone-to-computer connection: connect the Android phone over USB, then open Settings -> enable developer mode -> enable USB debugging -> allow (authorize) this computer to debug the phone;
3. Run the `adb devices` command in a terminal to list the connected devices:
```shell
adb devices
```
If the command succeeds, the output looks like the following (the serial number will differ):
```shell
List of devices attached
712QSDSEMMS7C device
```
## 1. One-click benchmark
Run the following commands to complete the benchmark:
```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/run_benchmark.sh
sh run_benchmark.sh
```
The `run_benchmark.sh` script will:
1. download the models and push them to the phone: mobilenetv1/v2, shufflenetv2, squeezenetv1.1, mnasnet;
2. download the pre-built android-armv7 and android-armv8 executables and push them to the phone: `benchmark_bin_v7` and `benchmark_bin_v8`;
3. automatically run another script, `benchmark.sh` (if several phones are connected over USB, append the test phone's `serial number` to the `adb` commands in `benchmark.sh`);
4. pull the benchmark results `result_armv7.txt` and `result_armv8.txt` from the phone to the current directory and display them.
## 2. Step-by-step benchmark
### 1. Get the benchmark executable
The benchmark_bin executable measures PaddleLite performance; it can be obtained in either of two ways.
#### Option 1: download the benchmark_bin executable
```shell
# Download benchmark_bin for android-armv7
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v7
# Download benchmark_bin for android-armv8
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_bin_v8
```
#### Option 2: build benchmark_bin from source
Set up the build environment per [source compilation](../source_compile), pull the latest PaddleLite release code, and run the following from the repository root:
```shell
###########################################
# Build benchmark_bin for android-armv7 #
###########################################
./lite/tools/ci_build.sh \
--arm_os="android" \
--arm_abi="armv7" \
--arm_lang="gcc " \
build_arm
# `benchmark_bin` is at: <paddle-lite-repo>/build.lite.android.armv7.gcc/lite/api/benchmark_bin
###########################################
# Build benchmark_bin for android-armv8 #
###########################################
./lite/tools/ci_build.sh \
--arm_os="android" \
--arm_abi="armv8" \
--arm_lang="gcc " \
build_arm
# `benchmark_bin` is at: <paddle-lite-repo>/build.lite.android.armv8.gcc/lite/api/benchmark_bin
```
> **Note**: to avoid problems reaching the phone from inside docker, it is recommended that after building benchmark_bin you exit docker and copy the benchmark_bin file to a temporary directory; then, in that temporary directory, follow the steps below to download the models, copy the script, and test.
### 2. Prepare the models
PaddleLite provides a set of [common benchmark models](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz).
Download and extract them:
```shell
wget -c https://paddle-inference-dist.bj.bcebos.com/PaddleLite/benchmark_0/benchmark_models.tgz
tar zxvf benchmark_models.tgz
```
To test other models, put the model files into the `benchmark_models` folder.
### 3. The benchmark.sh script
The benchmark driver script `benchmark.sh` lives at `/PaddleLite/lite/tools/benchmark.sh` in the source tree; for testing, copy `benchmark.sh`, `benchmark_bin` and `benchmark_models` into the same directory.
### 4. Run the tests
In a terminal, enter the folder containing benchmark.sh, the executables (benchmark_bin_v7, benchmark_bin_v8) and the models (benchmark_models).
If all the model files in `benchmark_models` have already been converted with `model_optimize_tool`, run the tests with the benchmark.sh script as follows:
```shell
# Benchmark for android-armv7
sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt
# Benchmark for android-armv8
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt
```
If none of the model files in `benchmark_models` have been converted with `model_optimize_tool`, run the commands below instead; `benchmark_bin` will first convert each model and then load it for testing.
```shell
# Benchmark for android-armv7
sh benchmark.sh ./benchmark_bin_v7 ./benchmark_models result_armv7.txt true
# Benchmark for android-armv8
sh benchmark.sh ./benchmark_bin_v8 ./benchmark_models result_armv8.txt true
```
When the tests finish, the armv7 and armv8 results are saved in the `result_armv7.txt` and `result_armv8.txt` files in the current directory.
**Viewing the results**
Open the `result_armv7.txt` and `result_armv8.txt` files in the current directory to view the results.
> Performance numbers differ across phones and versions.
```shell
run benchmark armv7
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 159.8427 ms
-- mobilenet_v1 avg = 235.0072 ms
-- mobilenet_v2 avg = 173.0387 ms
-- shufflenet_v2 avg = 76.0040 ms
-- squeezenet_v11 avg = 164.2957 ms
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 83.1287 ms
-- mobilenet_v1 avg = 121.6029 ms
-- mobilenet_v2 avg = 86.6175 ms
-- shufflenet_v2 avg = 41.5761 ms
-- squeezenet_v11 avg = 87.8678 ms
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 73.3880 ms
-- mobilenet_v1 avg = 119.0739 ms
-- mobilenet_v2 avg = 85.3050 ms
-- shufflenet_v2 avg = 38.0762 ms
-- squeezenet_v11 avg = 64.2201 ms
--------------------------------------
run benchmark armv8
--------------------------------------
PaddleLite Benchmark
Threads=1 Warmup=10 Repeats=30
-- mnasnet avg = 165.3073 ms
-- mobilenet_v1 avg = 306.0188 ms
-- mobilenet_v2 avg = 195.1884 ms
-- shufflenet_v2 avg = 99.3692 ms
-- squeezenet_v11 avg = 156.6971 ms
Threads=2 Warmup=10 Repeats=30
-- mnasnet avg = 90.2290 ms
-- mobilenet_v1 avg = 157.0007 ms
-- mobilenet_v2 avg = 118.1607 ms
-- shufflenet_v2 avg = 68.6804 ms
-- squeezenet_v11 avg = 91.3090 ms
Threads=4 Warmup=10 Repeats=30
-- mnasnet avg = 179.9730 ms
-- mobilenet_v1 avg = 204.0684 ms
-- mobilenet_v2 avg = 181.6486 ms
-- shufflenet_v2 avg = 123.2728 ms
-- squeezenet_v11 avg = 412.9046 ms
--------------------------------------
```
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys
#sys.path.insert(0, os.path.abspath('.'))
import sphinx_rtd_theme
from recommonmark.parser import CommonMarkParser
from recommonmark.transform import AutoStructify
# -- Project information -----------------------------------------------------
project = u'Paddle-Lite'
copyright = u'2020, Paddle-Lite Developer'
author = u'Paddle-Lite Developer'
# The short X.Y version
version = u'latest'
# The full version, including alpha/beta/rc tags
release = u''
# -- General configuration ---------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['recommonmark', 'sphinx_markdown_tables']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = ['.rst', '.md']
# The master toctree document.
master_doc = 'index'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}
# -- Options for HTMLHelp output ---------------------------------------------
# Output file base name for HTML help builder.
htmlhelp_basename = 'Paddle-Litedoc'
# -- Options for LaTeX output ------------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'Paddle-Lite.tex', u'Paddle-Lite Documentation',
u'Paddle-Lite Developer', 'manual'),
]
# -- Options for manual page output ------------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'paddle-lite', u'Paddle-Lite Documentation',
[author], 1)
]
# -- Options for Texinfo output ----------------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Paddle-Lite', u'Paddle-Lite Documentation',
author, 'Paddle-Lite', 'One line description of project.',
'Miscellaneous'),
]
# -- Options for Epub output -------------------------------------------------
# Bibliographic Dublin Core info.
epub_title = project
# The unique identifier of the text. This can be a ISBN number
# or the project homepage.
#
# epub_identifier = ''
# A unique identification for the text.
#
# epub_uid = ''
# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']
.. Paddle-Lite documentation master file, created by
   sphinx-quickstart on Thu Feb 6 14:11:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to Paddle-Lite's documentation!
=======================================

.. toctree::
   :maxdepth: 1
   :caption: Introduction
   :name: sec-introduction

   introduction/tech_highlights
   introduction/architecture

.. toctree::
   :maxdepth: 1
   :caption: Benchmark data and methods
   :name: sec-benchmark

   benchmark/benchmark
   benchmark/benchmark_tools

.. toctree::
   :maxdepth: 1
   :caption: Installation
   :name: sec-install

   installation/source_compile

.. toctree::
   :maxdepth: 1
   :caption: User guides
   :name: sec-user-guides

.. toctree::
   :maxdepth: 1
   :caption: Advanced user guides

   advanced_user_guides/support_operation_list
   advanced_user_guides/add_operation
   advanced_user_guides/model_quantization

.. toctree::
   :maxdepth: 1
   :caption: Developer documentation

.. toctree::
   :maxdepth: 1
   :caption: API documentation

.. toctree::
   :maxdepth: 1
   :caption: FAQ

.. toctree::
   :maxdepth: 1
   :caption: paddle-mobile
# Source compilation
Three build environments are currently supported:
1. a Docker container;
2. Linux (Ubuntu 16.04 recommended);
3. macOS.
# Architecture design
With this upgrade, Mobile becomes the Lite architecture, which focuses on multi-hardware and high-performance support. Its main design ideas are:
- introduce a Type system to strengthen mixed scheduling across hardware, quantization methods and data layouts
- isolate hardware details, so support for any hardware can be freely plugged in or out through build switches
- introduce the concept of MIR (Machine IR) to strengthen optimization with execution-environment awareness
- strictly separate the optimization phase from the execution phase, guaranteeing lightweight and efficient inference
The architecture diagram is shown below.
![Paddle Inference Refactor1.0](https://user-images.githubusercontent.com/52520497/64949619-26e49580-d8ac-11e9-855a-514feb9b75af.png)
## Strict separation of compile time and execution time
- once compile-time optimization completes, the optimization information can be stored into the model; at execution time it is loaded and executed
- two API sets, with corresponding inference libs, cover different scenarios (a hedged usage sketch follows this list):
  - `CxxPredictor` packages both `Compile Time` and `Execution Time`; it can analyze and optimize at runtime on the concrete hardware to get the best result
  - `MobilePredictor` packages only `Execution Time`, keeping deployment and execution lightweight
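A hedged sketch of the two entry points (API and setter names as in the lite_api headers referenced elsewhere in this document; verify against the release in use):
```c++
#include "paddle_api.h"

using namespace paddle::lite_api;

int main() {
  // Full API: loads a raw model and runs compile-time analysis/optimization.
  CxxConfig cxx_config;
  cxx_config.set_model_dir("./mobilenet_v1");  // illustrative path
  cxx_config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto full_predictor = CreatePaddlePredictor<CxxConfig>(cxx_config);

  // Light API: only executes an already-optimized (naive_buffer) model.
  MobileConfig mobile_config;
  mobile_config.set_model_dir("./mobilenet_v1_opt");  // illustrative path
  auto light_predictor = CreatePaddlePredictor<MobileConfig>(mobile_config);
  return 0;
}
```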
## Lightweight `Execution Time` design and implementation
- executing each batch involves only two steps:
  - `Op.InferShape`
  - `Kernel.Run`; all kernel parameters are resolved to pointers ahead of time, with no later lookup or argument-passing cost
- design goal: at execution time, the only cost is the kernel computation itself
- lightweight `Op` and `Kernel` design avoids extra framework overhead:
  - an `Op` has only two important responsibilities, `CreateKernels` and `InferShape`
  - a `Kernel` has only the `Run` responsibility
## Multi-hardware backend support
- common hardware behavior is adapted through the `TargetWrapper` adapter module, which presents a uniform interface to the upper framework; a toy sketch of the idea follows this list
- upper-level framework strategies stay hardware-agnostic, such as memory optimize and computation prune, so any newly added hardware can reuse them directly
- the framework covers common hardware behavior and does not over-constrain hardware specifics; each backend implements its own details and plugs into the framework
- two mainstream compute models are currently supported: non-heterogeneous devices such as X86 and ARM CPUs, and heterogeneous devices such as GPUs or FPGAs (supporting stream/event asynchronous execution and cross-device copies)
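To make the adapter idea concrete, here is a toy sketch (names are illustrative, not Lite's real `TargetWrapper` interface):
```c++
#include <cstddef>
#include <cstdlib>
#include <cstring>

// Common hardware behaviors (alloc/free/copy) behind one interface,
// specialized per target; backends for GPUs etc. would add streams/events.
template <typename Target>
struct TargetWrapper;

struct Host {};

template <>
struct TargetWrapper<Host> {
  static void* Malloc(size_t size) { return std::malloc(size); }
  static void Free(void* ptr) { std::free(ptr); }
  static void MemcpySync(void* dst, const void* src, size_t size) {
    std::memcpy(dst, src, size);
  }
};

int main() {
  void* buf = TargetWrapper<Host>::Malloc(64);
  TargetWrapper<Host>::MemcpySync(buf, "hello", 6);
  TargetWrapper<Host>::Free(buf);
  return 0;
}
```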
---
## Mixed scheduling across hardware and algorithms
`TensorTy` is used to represent a Tensor's type:
```c++
struct TensorTy {
TargetType target;
PrecisionType precision;
DataLayout layout;
int deviceid;
};
```
```c++
enum class TargetType { kARM, kX86, kCUDA, kOpenCL };
enum class PrecisionType { kFP32, kFP16, kInt8, kInt16 };
enum class DataLayout { kNCHW, kNHWC };
```
---
Kernel registration pins down a specific kernel's input and output characteristics:
```c++
REGISTER_LITE_KERNEL(
mul, kARM, kFloat, kNCHW, arm::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
.BindInput("Y", {LiteType::GetTensorTy(kARM, kFloat, kNCHW))})
.BindOutput("Out", {LiteType::GetTensorTy(kARM, kFloat, kNCHW)})
.Finalize();
```
---
Different kernels of the same op are analogous to function overloads.
To support arbitrary mixed scheduling:
1. mark the Type of every tensor in the model
2. mark each kernel's hardware, execution precision, data layout and other information
Type inference is then performed globally; when a type conflict is found along a tensor's path, a type cast operation is applied by inserting a special-purpose op, so that values are propagated correctly. A toy sketch of the conflict check follows the figure below.
![lite-7](https://user-images.githubusercontent.com/52520497/64949642-395ecf00-d8ac-11e9-8b69-ced1996abc3b.png)
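To make the mechanism concrete, here is a toy sketch of the conflict check (not Lite's real pass code; the real pass works on the MIR graph):
```c++
#include <iostream>
#include <string>
#include <vector>

enum class Target { kARM, kOpenCL };

// A node annotated with the target it produces on and the one it expects.
struct Node {
  std::string name;
  Target out;
  Target expected_in;
};

int main() {
  // conv2d produces on OpenCL; softmax expects ARM input -> cast needed.
  std::vector<Node> chain = {{"conv2d", Target::kOpenCL, Target::kOpenCL},
                             {"softmax", Target::kARM, Target::kARM}};
  for (size_t i = 0; i + 1 < chain.size(); ++i) {
    if (chain[i].out != chain[i + 1].expected_in) {
      std::cout << "insert io_copy between " << chain[i].name << " and "
                << chain[i + 1].name << "\n";
    }
  }
  return 0;
}
```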
---
## MIR for graph analysis and optimization
MIR is an SSA representation based on the Type System; IR passes analyze and optimize the compute graph:
- type inference over the whole graph, finding type conflicts and inserting type cast ops to support generic mixed scheduling
- compute prune, e.g. removing scale(1) and assign ops
- memory optimize
- operator fuse (six fusion strategies already supported, e.g. fc, conv_bn, ele_add+act)
- quantization handling (Int8 inference already supported)
# Technical highlights
Unlike ordinary mobile inference engines built on Caffe-like architectures, the Lite architecture was designed from the start for the requirements of both the Paddle Server and Mobile scenarios: the server side needs full graph analysis and optimization capability, while the mobile side needs lightweight deployment. What both demand is high performance and multi-hardware support.
Based on these requirements, the Lite architecture implements the corresponding capabilities in full; the highlights are described below.
## Multi-hardware support
The Lite architecture has been validated to fully support hardware needs from Mobile to Server, including ARM CPU, ARM GPU, Huawei NPU, Intel X86 CPU and NV GPU. Thanks to a measured level of abstraction over the different hardware, Lite keeps the framework itself clear while supporting hardware-specific scheduling needs, striking a good balance between framework clarity and per-hardware scheduling optimization; for example, the intricate stream and event allocation on Nvidia GPUs can be expressed cleanly in Lite.
Kernels for different hardware do not interfere with one another at either the code level or the execution level, so users can freely plug support for any hardware in or out.
## High performance
Performance comes from two sources: kernel optimization and framework execution.
On the kernel side, the kernels for each hardware target have been deeply optimized via instruction sets, operator fusion, algorithm rewrites and other techniques.
On the framework side, simplifying the responsibilities of Op and Kernel keeps the execution-time framework overhead extremely low; in addition, the framework's flexibility supports hardware-specific scheduling optimizations that raise overall efficiency.
## Quantization support
Lite runs models produced by Paddle Slim's powerful quantization training, fully preserving both the high performance of quantized computation and the high accuracy of quantization-aware training.
## Strong graph analysis and optimization
For graph analysis and optimization, unlike conventional mobile inference engines that convert models with Python scripting tools, the Lite architecture has a complete C++ IR and a corresponding collection of passes, supporting operator fusion, computation pruning, memory optimization, quantitative computation and other classes of compute-graph optimizations.
Further optimization strategies can be supported modularly simply by adding passes.
## Lightweight deployment
Although the graph-optimization strategies are sophisticated, they do not weigh down mobile deployment: the graph-analysis module and the final execution engine can be used separately, so the final deployment is just a thin layer of kernels.
## Mixed scheduling across any available hardware
Lite supports mixed scheduling across any hardware visible to the system; automatic mixed scheduling of ARM CPU and ARM GPU kernels is already supported, and mixed scheduling between an X86 CPU and an Nvidia GPU has been validated.
There are two motivations for supporting mixed scheduling:
1. when several kinds of hardware are available in one system, mixed scheduling makes full use of all the hardware resources
2. as the set of supported models grows, kernel coverage inevitably differs across hardware, and mixed scheduling is often needed just to run a model end to end
By supporting the `Type system` at the bottom layer, the Lite architecture models all kinds of mixed execution generically, giving fairly complete support for mixed scheduling.
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
sphinx
recommonmark
sphinx_markdown_tables
sphinx_rtd_theme
@@ -41,6 +41,8 @@ void LightPredictor::Build(const std::string& model_dir,
    default:
      LOG(FATAL) << "Unknown model type";
  }
  DequantizeWeight();
  BuildRuntimeProgram(cpp_program_desc_);
  PrepareFeedFetch();
}
@@ -144,5 +146,69 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
  program_->set_exec_scope(program.exec_scope());
}
void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \
for (int64_t j = 0; j < w; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \
} \
}
#define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \
*fp_data = scale_list[0] * (*int_data); \
++fp_data; \
++int_data; \
}
Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize());
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
for (size_t k = 0; k < main_block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0];
int64_t w = input_tensor->numel() / h;
CHECK_EQ(scale_list.size(), h);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
}
}
}
}
}
#undef PROCESS_CONV2D_DATA
#undef PROCESS_FC_DATA
}
}  // namespace lite
}  // namespace paddle
@@ -78,6 +78,8 @@ class LITE_API LightPredictor {
  void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
  void DequantizeWeight();

 private:
  std::shared_ptr<Scope> scope_;
  std::unique_ptr<RuntimeProgram> program_;
......
@@ -116,6 +116,34 @@ static size_t PrecisionTypeLength(PrecisionType type) {
  }
}
template <typename T>
struct PrecisionTypeTrait {
constexpr static PrecisionType Type() { return PrecisionType::kUnk; }
};
#define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \
callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type);
#define _ForEachPrecisionType(callback) \
_ForEachPrecisionTypeHelper(callback, bool, kBool); \
_ForEachPrecisionTypeHelper(callback, float, kFloat); \
_ForEachPrecisionTypeHelper(callback, int8_t, kInt8); \
_ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \
_ForEachPrecisionTypeHelper(callback, int, kInt32); \
_ForEachPrecisionTypeHelper(callback, int64_t, kInt64);
#define DefinePrecisionTypeTrait(cpp_type, precision_type) \
template <> \
struct PrecisionTypeTrait<cpp_type> { \
constexpr static PrecisionType Type() { return precision_type; } \
}
_ForEachPrecisionType(DefinePrecisionTypeTrait);
#undef _ForEachPrecisionTypeHelper
#undef _ForEachPrecisionType
#undef DefinePrecisionTypeTrait
#define TARGET(item__) paddle::lite_api::TargetType::item__
#define PRECISION(item__) paddle::lite_api::PrecisionType::item__
#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
......
@@ -44,3 +44,4 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
@@ -617,7 +617,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"fcmge v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fcmge v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_f32 */ \
"fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \
"fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
\
"fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \
@@ -1627,7 +1627,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
\
"vbif q4, q6, q15 @ choose \n" \
"vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \
"vmul.f32 q6, q5, q14 \n" \
"vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \
"vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \
\
@@ -1815,8 +1815,8 @@ void conv_depthwise_3x3s1_fp32(const float *din,
"vmul.f32 q12, q14, q9 \n" \
"vmul.f32 q13, q15, q9 \n" \
\
"vbif q14, q12, q10 \n" \
"vbif q15, q13, q11 \n" \
\
"vst1.32 {d28-d29}, [%[out1]]\n" \
"vst1.32 {d30-d31}, [%[out2]]\n"
......
@@ -209,9 +209,9 @@ namespace math {
"fcmge v7.4s, v22.4s, v0.4s \n" /* vcgeq_f32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v20.16b, v4.16b, v3.16b \n" /* choose*/ \
"bif v21.16b, v6.16b, v5.16b \n" /* choose*/ \
"bif v22.16b, v8.16b, v7.16b \n" /* choose*/
#define STORE /* save result */ \
"str q19, [%[outc0]], #16\n" \
"str q20, [%[outc1]], #16\n" \
......
@@ -360,12 +360,12 @@ __read_only image2d_t new_scale,
      READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif

  output0 = activation_type4(output0);
  output1 = activation_type4(output1);
  output2 = activation_type4(output2);
  output3 = activation_type4(output3);

  if (out_w0 < old_w) {
    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
......
#include <cl_common.h>
__kernel void conv2d_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 5; j++) {
for (int k = 0; k < 5; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 2) * dilation,
pos_in.y + (k - 2) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 2) * dilation < 0 ||
in_pos_in_one_block.y + (k - 2) * dilation < 0 ||
in_pos_in_one_block.x + (j - 2) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 2) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output =
output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
#include <cl_common.h>
__kernel void conv2d_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 output_pos_in_one_block;
output_pos_in_one_block.x = out_w;
output_pos_in_one_block.y = out_nh_in_one_batch;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
CL_DTYPE4 output =
READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
#else
CL_DTYPE4 output = 0.0f;
#endif
CL_DTYPE4 input;
CL_DTYPE4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y + batch_index * input_height);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input_image,
sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(CL_DTYPE4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos0);
filter[1] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos1);
filter[2] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos2);
filter[3] =
READ_IMG_TYPE(CL_DTYPE_CHAR, filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * READ_IMG_TYPE(
CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
#endif
output = activation_type4(output);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}
...@@ -14,15 +14,72 @@ limitations under the License. */
#include <cl_common.h>
__kernel void elementwise_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords);
CL_DTYPE4 output = activation_type4(in + biase);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in + (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void width_add(__read_only image2d_t input,
__read_only image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
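// The image x coordinate encodes c_block * w + w_pos, so x % w recovers the
// width position; the bias image is a single row indexed by that position.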
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output;
output.x = in.x + biase.x;
output.y = in.y + biase.x;
output.z = in.z + biase.x;
output.w = in.w + biase.x;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void elementwise_mul(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d2(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w, int h) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x % w;
coords_bias.y = y % h;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d4(__read_only image2d_t input, __read_only image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x / w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void nearest_interp(__read_only image2d_t input, __write_only image2d_t output,
__private const float scale_h, __private const float scale_w,
__private const int in_dims_h, __private const int out_dims_h,
__private const int in_dims_w, __private const int out_dims_w) {
const int c = get_global_id(0);
const int w = get_global_id(1);
const int nh = get_global_id(2);
int2 output_pos;
output_pos.x = c * out_dims_w + w;
output_pos.y = nh;
int out_n = nh / out_dims_h;
int out_h = nh % out_dims_h;
int2 input_pos;
input_pos.x = c * in_dims_w + w / scale_w;
input_pos.y = out_n * in_dims_h + out_h / scale_h;
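// Nearest-neighbor lookup: output (h, w) maps back to input
// (h / scale_h, w / scale_w); scale_* is assumed to be the
// output-to-input size ratio supplied by the host side.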
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input_data = read_imageh(input, sampler, (int2)(input_pos.x, input_pos.y));
write_imageh(output, (int2)(output_pos.x , output_pos.y), input_data);
}
...@@ -35,6 +35,7 @@ lite_cc_library(mir_passes
demo_pass.cc
runtime_context_assign_pass.cc
memory_optimize_pass.cc
weight_quantization_preprocess_pass.cc
DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
......
...@@ -100,14 +100,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto eps = matched.at("bn")->stmt()->op_info()->GetAttr<float>("epsilon");
// conv
std::string conv_weight_name = matched.at("conv_weight")->arg()->name;
auto conv_weight_t =
scope->FindVar(conv_weight_name)->GetMutable<lite::Tensor>();
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization =
conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
// compute BN alpha and beta
Tensor alpha_tensor, beta_tensor;
...@@ -160,6 +163,16 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
}
conv_op_desc->SetAttr("weight_scale", weight_scale);
} else if (is_weight_quantization) {
std::string scale_name = conv_weight_name + "_quant_scale";
if (conv_op_desc->HasAttr(scale_name)) {
auto scale = conv_op_desc->GetAttr<std::vector<float>>(scale_name);
CHECK_EQ(scale.size(), alpha_tensor.numel());
for (size_t i = 0; i < scale.size(); i++) {
scale[i] *= alpha_data[i];
}
conv_op_desc->SetAttr(scale_name, scale);
}
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/weight_quantization_preprocess_pass.h"
#include <memory>
#include <string>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void WeightQuantizationPreprocessPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> weight_quantized_op = {"conv2d", "depthwise_conv2d"};
for (auto& node : graph->StmtTopologicalOrder()) {
if (node->IsStmt() &&
std::find(weight_quantized_op.begin(),
weight_quantized_op.end(),
node->AsStmt().op_type()) != weight_quantized_op.end()) {
auto* scope = node->stmt()->op()->scope();
auto* op_desc = node->stmt()->mutable_op_info();
if (op_desc->HasAttr("quantize_weight_bits")) {
for (auto& input_name : op_desc->input_vars()) {
std::string scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(scale_name)) {
VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name;
auto input_tensor =
scope->FindVar(input_name)->GetMutable<lite::Tensor>();
int weight_out_channel = static_cast<int>(input_tensor->dims()[0]);
auto input_scale = op_desc->GetAttr<std::vector<float>>(scale_name);
// scale length is equal to weight out channel
std::vector<float> scale_list(weight_out_channel, input_scale[0]);
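// e.g. a conv2d with 32 output channels and a single saved per-tensor
// scale {s} now carries the 32-element list {s, s, ...}, so downstream
// passes can treat the weight scale as channel-wise.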
op_desc->SetAttr(scale_name, scale_list);
}
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(weight_quantization_preprocess_pass,
paddle::lite::mir::WeightQuantizationPreprocessPass)
.BindTargets({TARGET(kAny)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "lite/core/mir/pass.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
namespace mir {
/*
* If the model is quantized by WeightQuantization in PostTrainingQuantization,
* the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is
* int, and the scale is saved in the quantized ops.
* WeightQuantizationPreprocessPass obtains the scale value, expands the
* scale value to a list, and saves the list in the quantized ops.
*/
class WeightQuantizationPreprocessPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -55,10 +55,11 @@ class Optimizer {
if (passes.empty()) {
std::vector<std::string> passes_local{
{"lite_quant_dequant_fuse_pass",         //
"weight_quantization_preprocess_pass",   //
"lite_conv_elementwise_fuse_pass",       // conv-elemwise-bn
"lite_conv_bn_fuse_pass",                //
"lite_conv_elementwise_fuse_pass",       // conv-bn-elemwise
// TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically.
"lite_conv_activation_fuse_pass",  //
......
...@@ -37,22 +37,21 @@ value_type DDimLite::count(int start, int end) const {
if (end < start) {
return 0;
}
value_type sum = 1;
for (auto i = start; i < end; ++i) {
sum *= data_[i];
}
return sum;
}
DDimLite DDimLite::Slice(int start, int end) const {
start = std::max(start, 0);
end = std::min(end, static_cast<int>(data_.size()));
std::vector<value_type> new_dim(end - start);
for (int i = start; i < end; i++) {
new_dim[i - start] = data_[i];
}
return DDim(new_dim);
}
std::string DDimLite::repr() const {
......
...@@ -204,7 +204,7 @@ class TensorLite {
// LOG(INFO) << "Set dims: " << dims_ << " for tensor " << this;
}
void Resize(const std::vector<int64_t> &x) {
dims_.ConstructFrom(x);
// LOG(INFO) << "Set dims: " << dims_ << " for tensor " << this;
}
...@@ -230,22 +230,7 @@ class TensorLite {
// For other devices, T and R may be the same type.
template <typename T, typename R = T>
R *mutable_data() {
precision_ = lite_api::PrecisionTypeTrait<T>::Type();
memory_size_ = dims_.production() * sizeof(T);
buffer_->ResetLazy(target_, memory_size_);
// char *ptr = static_cast<char *>(buffer_->data()) + offset_;
......
# get the name of the current operating system: Linux or Darwin
SYSTEM=$(shell "uname")
CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
-DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
LDFLAGS = -latomic -pthread -ldl -llog -lz
SYSROOT_COMPLILE = --sysroot=$(NDK_ROOT)/sysroot
SYSTEM_INCLUDES = -I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/include \
-I$(NDK_ROOT)/sources/cxx-stl/llvm-libc++abi/include \
-I$(NDK_ROOT)/sources/android/support/include \
-I$(NDK_ROOT)/sysroot/usr/include \
ifeq ($(ARM_ABI), arm8)
ifeq ($(SYSTEM), Linux)
CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++
else
CC = $(NDK_ROOT)/toolchains/aarch64-linux-android-4.9/prebuilt/darwin-x86_64/bin/aarch64-linux-android-g++
endif
CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections
SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-24/arch-arm64
SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a
INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/aarch64-linux-android
else
ifeq ($(SYSTEM), Linux)
CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++
else
CC = $(NDK_ROOT)/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-g++
endif
CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \
-D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE
CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc
SYSROOT_LINK = --sysroot=$(NDK_ROOT)/platforms/android-23/arch-arm
SYSTEM_LIBS = $(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \
$(NDK_ROOT)/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
INCLUDES = $(SYSTEM_INCLUDES) -I$(NDK_ROOT)/sysroot/usr/include/arm-linux-androideabi
endif
...@@ -2,34 +2,40 @@ if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL))
return ()
endif()
set(cl_kernel_deps op_params cl_runtime cl_context cl_wrapper cl_target_wrapper cl_image_converter)
add_kernel(fc_opencl OPENCL basic SRCS fc_compute.cc DEPS ${cl_kernel_deps})
add_kernel(mul_opencl OPENCL basic SRCS mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_add_opencl OPENCL basic SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps})
add_kernel(elementwise_mul_opencl OPENCL basic SRCS elementwise_mul_compute.cc DEPS ${cl_kernel_deps})
add_kernel(fusion_elementwise_add_activation_opencl
OPENCL basic SRCS fusion_elementwise_add_activation_compute.cc
DEPS elementwise_add_opencl ${cl_kernel_deps})
add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps})
add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps})
add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps})
add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps})
#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps})
add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps})
add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps} cl_image_converter)
add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps})
add_kernel(nearest_interp_opencl OPENCL basic SRCS nearest_interp_compute.cc DEPS ${cl_kernel_deps})
lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc
DEPS elementwise_add_opencl fusion_elementwise_add_activation_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_elementwise_mul_opencl SRCS elementwise_mul_compute_test.cc
DEPS elementwise_mul_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc
DEPS pool_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
DEPS fc_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
# TODO(ysh329): comment for buffer-impl mul
#lite_cc_test(test_mul_opencl SRCS mul_compute_test.cc
...@@ -37,34 +43,41 @@ lite_cc_test(test_fc_opencl SRCS fc_compute_test.cc
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_io_copy_compute_opencl SRCS io_copy_compute_test.cc
DEPS io_copy_compute_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#TODO(ysh329): comment buffer-impl relu
lite_cc_test(test_relu_opencl SRCS relu_compute_test.cc
DEPS relu_opencl layout_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_depthwise_conv2d_basic_opencl SRCS depthwise_conv2d_basic_compute_test.cc
DEPS depthwise_conv2d_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc
# DEPS conv2d_1x1_opencl op_registry program context
# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc
DEPS reshape_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc
DEPS conv_opencl op_registry program context
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_conv_image2d_opencl SRCS conv_image2d_compute_test.cc
DEPS conv_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_layout_opencl SRCS layout_compute_test.cc
DEPS layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
lite_cc_test(test_nearest_interp_opencl SRCS nearest_interp_compute_test.cc
DEPS nearest_interp_opencl layout_opencl op_registry program context cl_image_converter
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl)
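# NOTE: cl_image_converter now ships in cl_kernel_deps, so kernel targets above
# no longer list it explicitly; only the image-layout tests still add it.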
...@@ -13,9 +13,13 @@
// limitations under the License.
#include "lite/kernels/opencl/conv_compute.h"
#include <sstream>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
namespace paddle {
...@@ -242,7 +246,6 @@ void ConvCompute::Conv2d1x1() {
GemmBatched(kernel, x_d, filter_d, bias_d, output_d, batch_size, m, n, k);
}
// a: filter_d ==> <m, k> <=> <oc, ic>
// b: x_d ==> <k, n> <=> <ic, ih*iw>
// c: output_d ==> <m, n> <=> <oc, ih*iw>
...@@ -294,6 +297,582 @@ void ConvCompute::GemmBatched(cl::Kernel& kernel,
void ConvCompute::Run() { (this->*impl_)(); }
/* image kernel*/
void ConvImageCompute::PrepareForRun() {
const auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
float* filter_cpu = param.filter->mutable_data<float>();
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
int bs = x_dims[0];
int c_in = x_dims[1];
int h_out = output_dims[2];
int w_out = output_dims[3];
int kernel_h = filter_dims[2]; // oihw
int kernel_w = filter_dims[3];
auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
int pad_h = paddings[0];
int pad_w = paddings[2];
int groups = param.groups;
bool relu_fused = param.fuse_relu;
bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1);
bool zero_pad = (pad_h == 0) && (pad_w == 0);
bool pad_equal =
((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) &&
(paddings[2] == paddings[3]));
bool stride_equal = stride_h == stride_w;
bool dilation_equal = dilations[0] == dilations[1];
CHECK(pad_equal && stride_equal && dilation_equal);
VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No");
VLOG(3) << "groups:" << groups << " stride_h:" << stride_h
<< " stride_w:" << stride_w << " pad_h:" << pad_h
<< " pad_w:" << pad_w << " kernel_h:" << kernel_h
<< " kernel_w:" << kernel_w;
VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2]
<< " " << x_dims[3];
VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " "
<< output_dims[2] << " " << output_dims[3];
VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " "
<< filter_dims[2] << " " << filter_dims[3];
if (kernel_h == 1 && kernel_w == 1) {
// conv2d_1x1
if (param.x->dims()[1] % 4 == 0) {
kernel_func_names_.push_back("conv2d_1x1_simple");
} else {
kernel_func_names_.push_back("conv2d_1x1");
}
kernel_func_paths_.push_back("image/conv2d_1x1_kernel.cl");
CLImageConverterNWBlock converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
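// The OIHW filter has been repacked on the host into the 2D image layout
// the kernel samples (4 values per texel) and uploaded once here, so
// Run() only needs to set kernel arguments.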
impl_ = &ConvImageCompute::Conv2d1x1;
} else if (kernel_h == 5 && kernel_w == 5) {
// conv2d_5x5
kernel_func_names_.push_back("conv2d_5x5");
kernel_func_paths_.push_back("image/conv2d_5x5_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d5x5;
} else if (kernel_h == 7 && kernel_w == 7) {
// conv2d_7x7
kernel_func_names_.push_back("conv2d_7x7");
kernel_func_paths_.push_back("image/conv2d_7x7_kernel.cl");
CLImageConverterFolder converter;
const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims);
std::vector<float> filter_image_v(filter_image_dims[0] *
filter_image_dims[1] * 4); // 4 : RGBA
converter.NCHWToImage(filter_cpu, filter_image_v.data(), filter_dims);
this->filter_gpu_image_.mutable_data<float, cl::Image2D>(
filter_image_dims[0], filter_image_dims[1], filter_image_v.data());
impl_ = &ConvImageCompute::Conv2d7x7;
} else {
LOG(FATAL) << "conv image compute does not support this condition yet!";
}
std::string build_options_single(" -DCL_DTYPE_float");
// relu options
if (relu_fused) {
build_options_single += " -DRELU";
} else if (param.activation_param.active_type ==
lite_api::ActivationType::kRelu6) {
build_options_single += " -DRELU6";
} else {
// do nothing
}
// bias options
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
if (has_bias) {
build_options_single +=
is_element_wise_bias ? " -DBIASE_ELE" : " -DBIASE_CH";
// convert cpu buffer bias --> gpu image
CLImageConverterFolder bias_converter;
const DDim& bias_image_dims =
bias_converter.InitImageDimInfoWith(param.bias->dims());
std::vector<float> bias_image_v(bias_image_dims[0] * bias_image_dims[1] *
4);
float* bias_cpu_data = param.bias->mutable_data<float>();
bias_converter.NCHWToImage(
bias_cpu_data, bias_image_v.data(), param.bias->dims());
this->bias_gpu_image_.mutable_data<float, cl::Image2D>(
bias_image_dims[0], bias_image_dims[1], bias_image_v.data());
// convert cpu buffer bias --> gpu image --- end ----
}
build_options_.push_back(build_options_single);
for (size_t i = 0; i < kernel_func_names_.size(); i++) {
context.cl_context()->AddKernel(
kernel_func_names_[i], kernel_func_paths_[i], build_options_[i]);
}
}
void ConvImageCompute::Conv2d1x1() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
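// offset = kernel_h / 2 - pad aligns the sampling window with the padded
// input; for this 1x1 path it reduces to -pad.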
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d_1x1 params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
// handle bias: use a buffer for channel-wise bias, an image for element-wise
const cl::Buffer* bias_buf = nullptr;
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
std::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int maped_w = maptofactor(w, 4);
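// Each conv2d_1x1 work-item covers 4 output pixels along W, so the W
// dimension of the launch grid is rounded up (assumption:
// maptofactor(w, 4) == (w + 3) / 4, judging from its use here).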
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "maped_w: " << maped_w;
VLOG(4) << "hasbias: " << has_bias;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, maped_w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(maped_w),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
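// Register the output image on the wait list so downstream kernels or the
// host-side fetch can synchronize on this event before reading the result.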
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d5x5() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Conv2d7x7() {
const auto& param = *param_.get_mutable<param_t>();
auto input_dims = param.x->dims();
auto paddings = *param.paddings;
auto strides = param.strides;
auto* input_image = param.x->data<float, cl::Image2D>();
auto* filter_image = filter_gpu_image_.data<float, cl::Image2D>();
auto filter_dims = param.filter->dims();
auto output_dims = param.output->dims();
int input_width = input_dims[3];
int input_height = input_dims[2];
int output_width = output_dims[3];
int output_height = output_dims[2];
int filter_width = filter_dims[3];
int filter_height = filter_dims[2];
auto out_image_shape = InitImageDimInfoWith(output_dims);
auto* out_image = param.output->mutable_data<float, cl::Image2D>(
out_image_shape["width"], out_image_shape["height"]);
const bool has_bias = param.bias != nullptr;
const bool is_element_wise_bias =
has_bias && param.output->dims() == param.bias->dims();
int offset = static_cast<int>(param.filter->dims()[2]) / 2 -
static_cast<int>(paddings[0]);
// calc input_c_block
auto input_image_shape = InitImageDimInfoWith(input_dims);
int input_c_block = input_image_shape["width"] / input_dims[3];
int input_c = input_dims[1];
auto dilations = *param.dilations;
const std::vector<size_t>& default_work_size =
DefaultWorkSize(output_dims,
DDim(std::vector<DDim::value_type>{
static_cast<int64_t>(out_image_shape["width"]),
static_cast<int64_t>(out_image_shape["height"])}));
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
VLOG(4) << "============ conv2d params ============";
VLOG(4) << "input_image_shape: " << input_image_shape["width"] << ","
<< input_image_shape["height"];
VLOG(4) << "input_c_block: " << input_c_block;
VLOG(4) << "input_c: " << input_c;
VLOG(4) << "input_image: " << input_image;
VLOG(4) << "input_dims: " << input_dims;
VLOG(4) << "filter_dims: " << filter_dims;
VLOG(4) << "filter_image: " << filter_image;
VLOG(4) << "output_dims: " << output_dims;
VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", "
<< out_image_shape["height"];
VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1];
VLOG(4) << "has bias: " << has_bias;
VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias;
VLOG(4) << "strides: " << strides[0] << "," << strides[1];
VLOG(4) << "offset: " << offset;
VLOG(4) << "dilations.size : " << dilations.size();
VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1];
VLOG(4) << "default work size{c_block, w, nh}: "
<< "{" << c_block << ", " << w << ", " << nh << ""
<< "}";
CHECK_GE(dilations.size(), 2);
CHECK(dilations[0] == dilations[1]);
CHECK_GE(input_dims.size(), 4);
CHECK_GE(paddings.size(), 2);
CHECK(paddings[0] == paddings[1]);
CHECK_GE(strides.size(), 2);
CHECK(strides[0] == strides[1]);
const cl::Image2D* bias_image = nullptr;
if (has_bias) {
bias_image = bias_gpu_image_.data<float, cl::Image2D>();
}
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_names_[0] << build_options_[0];
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
VLOG(4) << "kernel_key: " << kernel_key.str();
VLOG(4) << "kernel ready ... " << kernel_key.str();
VLOG(4) << "w: " << w;
cl_int status;
int arg_idx = 0;
status = kernel.setArg(arg_idx, c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, w);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, nh);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *input_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *filter_image);
CL_CHECK_FATAL(status);
if (has_bias) {
VLOG(4) << "set bias_image: ";
status = kernel.setArg(++arg_idx, *bias_image);
CL_CHECK_FATAL(status);
}
status = kernel.setArg(++arg_idx, *out_image);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, strides[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, offset);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_c_block);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, dilations[0]);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, input_height);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_width);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
auto global_work_size =
cl::NDRange{static_cast<size_t>(default_work_size.data()[0]),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
VLOG(4) << "out_image: " << out_image;
VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
<< global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_image, event_);
}
void ConvImageCompute::Run() { (this->*impl_)(); }
} // namespace opencl
} // namespace kernels
} // namespace lite
...@@ -310,3 +889,21 @@ REGISTER_LITE_KERNEL(conv2d,
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.Finalize();
REGISTER_LITE_KERNEL(conv2d,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::ConvImageCompute,
image2d)
.BindInput("Input",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
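// conv2d now has two OpenCL kernel registrations: the buffer/NCHW ConvCompute
// above and this image2d variant; the framework picks one according to the
// data layout selected for the graph.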
...@@ -17,6 +17,7 @@
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/tensor.h"
...@@ -57,6 +58,30 @@ class ConvCompute
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ConvParam;
using kernel_t = void (ConvImageCompute::*)();
void PrepareForRun() override;
void Run() override;
private:
void Conv2d1x1();
void Conv2d5x5();
void Conv2d7x7();
kernel_t impl_;
std::vector<std::string> kernel_func_names_{};
std::vector<std::string> kernel_func_paths_{};
std::vector<std::string> build_options_{};
std::shared_ptr<cl::Event> event_{new cl::Event};
Tensor filter_gpu_image_;
Tensor bias_gpu_image_;
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>  // std::abs on floating-point values, used in the checks below
#include <random>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
// #define SHADOW_LOG LOG(INFO)
#define SHADOW_LOG VLOG(4)
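// conv_basic: naive grouped-convolution reference on NCHW buffers, used by
// the tests below to validate the OpenCL image kernels; supports an optional
// bias and "relu"/"relu6" activation fusing.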
template <typename Dtype1, typename Dtype2>
static void conv_basic(const Dtype1* din,
Dtype2* dout,
int num,
int chout,
int hout,
int wout,
int chin,
int hin,
int win,
const Dtype1* weights,
const Dtype2* bias,
int group,
int kernel_w,
int kernel_h,
int stride_w,
int stride_h,
int dila_w,
int dila_h,
int pad_w,
int pad_h,
bool flag_bias,
std::string flag_relu) {
Dtype2 beta = 0;
auto src_data = din;
auto dst_data_ref = dout;
auto weights_data = weights;
auto with_bias = flag_bias;
auto bias_data = bias;
int in_num = num;
int out_channels = chout;
int out_h = hout;
int out_w = wout;
int in_channel = chin;
int in_h = hin;
int in_w = win;
int out_c_group = out_channels / group;
int in_c_group = in_channel / group;
for (int n = 0; n < in_num; ++n) {
for (int g = 0; g < group; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < out_h; ++oh) {
for (int ow = 0; ow < out_w; ++ow) {
int out_idx = n * group * out_c_group * out_h * out_w +
g * out_c_group * out_h * out_w + oc * out_h * out_w +
oh * out_w + ow;
Dtype2 bias_d =
with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0;
dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - pad_w + kw * (dila_w);
int ih = oh * stride_h - pad_h + kh * (dila_h);
if (iw < 0 || iw >= in_w) continue;
if (ih < 0 || ih >= in_h) continue;
int iidx = n * in_channel * in_h * in_w +
g * in_c_group * in_h * in_w + ic * in_h * in_w +
ih * in_w + iw;
int widx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx];
}
}
}
if (flag_relu == "relu") {
dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0
? dst_data_ref[out_idx]
: (Dtype2)0;
} else if (flag_relu == "relu6") {
auto dst_tmp = (dst_data_ref[out_idx] > (Dtype2)0)
? dst_data_ref[out_idx]
: (Dtype2)0;
dst_data_ref[out_idx] = (dst_tmp < 6.f) ? dst_tmp : 6.f;
}
}
}
}
}
}
}
int ConvOutputSize(int input_size,
int filter_size,
int dilation,
int pad_left,
int pad_right,
int stride) {
const int dkernel = dilation * (filter_size - 1) + 1;
int output_size =
(input_size + (pad_left + pad_right) - dkernel) / stride + 1;
return output_size;
}
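// Worked example (illustration only, matching the 5x5 test below):
// input_size=5, filter_size=5, dilation=1, pad_left=pad_right=2, stride=1:
//   dkernel     = 1 * (5 - 1) + 1 = 5
//   output_size = (5 + 4 - 5) / 1 + 1 = 5, i.e. the spatial size is kept.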
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_1x1) {
// conv infos
const int ksize = 1;
const int stride = 1;
const int pad = 0;
const int group = 1;
const int dilation = 0;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 1; batch_size < 4; ++batch_size) {
for (int oc = 4; oc < 10; oc += 1) { // oc
for (int ih = 4; ih < 9; ih += 1) { // ih
for (int iw = 4; iw < 10; iw += 1) { // iw
for (int ic = 4; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
#else
const int batch_size = 1;
const int oc = 4;
const int ih = 8;
const int iw = 8;
const int ic = 4;
const bool bias_flag = true;
const std::string relu_flag = "relu";
#endif
const int oh = ih;
const int ow = iw;
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d_1x1 kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_1x1_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(conv_1x1_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_1x1_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, ih, iw})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((oc + 3) / 4);
size_t filter_image_height = ic * ksize;
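// Note: each cl::Image2D texel packs 4 values (RGBA), so an NCHW tensor
// maps to an image of width w * ceil(c / 4), with the batch folded into
// the image height.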
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * ih * iw);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): "
<< bias_dim.production();
SHADOW_LOG << "4 * input_image_height * input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(
input_image_width * input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 :RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(
out_image_width * out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "set mapped filter ...";
paddle::lite::CLImageConverterNWBlock nw_convertor;
nw_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d =
// filter.mutable_data<float, cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核1: ---- ";
const float* filter_p = filter.data<float>();
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << *filter_p;
filter_p++;
}
SHADOW_LOG << "卷积核2: ---- ";
const float* filter_p2 = filter.mutable_data<float>();
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << *filter_p2;
filter_p2++;
}
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target"
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_5x5) {
// conv infos
const int ksize = 5;
const int stride = 1;
const int pad = 2;
const int group = 1;
const int dilation = 1;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 5; ih < 9; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {/*true,*/ "relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
const int ih = 5;
const int iw = 5;
const int ic = 1;
const bool bias_flag = true;
const std::string relu_flag = "relu";
#endif
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_context(
    new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
    &(conv_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((ic + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
}
for (auto& f : filter_v) {
f = gen(engine);
}
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "4 * input_image_height *input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 :RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 :RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// filter kernel
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
// #define LOOP_TEST
TEST(conv2d, compute_image2d_7x7) {
// conv infos
const int ksize = 7;
const int stride = 1;
const int pad = 2;
const int group = 1;
const int dilation = 1;
// int loop_cnt = 0;
#ifdef LOOP_TEST
for (int batch_size = 2; batch_size < 4; ++batch_size) {
for (int oc = 1; oc < 10; oc += 1) { // oc
for (int ih = 7; ih < 15; ih += 1) { // ih
int iw = ih;
for (int ic = 1; ic < 10; ic += 1) { // ic
for (bool bias_flag : {true, false}) {
for (std::string relu_flag : {"relu"}) {
#else
const int batch_size = 2;
const int oc = 1;
const int ih = 7;
const int iw = 7;
const int ic = 1;
const bool bias_flag = false;
const std::string relu_flag = "";
#endif
const int oh =
ConvOutputSize(ih, ksize, dilation, pad, pad, stride);
const int ow =
ConvOutputSize(iw, ksize, dilation, pad, pad, stride);
SHADOW_LOG << "to get kernel ...";
auto kernels =
KernelRegistry::Global().Create("conv2d",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
SHADOW_LOG << "created conv2d kernel";
SHADOW_LOG << "prepare kernel ------";
lite::Tensor input, filter, bias, output;
operators::ConvParam param;
param.x = &input;
param.filter = &filter;
param.output = &output;
if (bias_flag) {
param.bias = &bias;
}
if (relu_flag == "relu") {
param.fuse_relu = true;
} else if (relu_flag == "None") {
param.fuse_relu = false;
} else if (relu_flag == "relu6") {
param.activation_param.Relu_clipped_coef = 6.f;
param.activation_param.has_active = true;
param.activation_param.active_type =
lite_api::ActivationType::kRelu6;
}
std::vector<int> paddings = {pad, pad, pad, pad};
std::vector<int> dilations = {dilation, dilation};
param.paddings = std::make_shared<std::vector<int>>(paddings);
param.dilations = std::make_shared<std::vector<int>>(dilations);
param.strides = std::vector<int>{stride, stride};
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
std::unique_ptr<KernelContext> conv_context(
    new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
    &(conv_context->As<OpenCLContext>()));
kernel->SetContext(std::move(conv_context));
const DDim& input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
const DDim& filter_dim =
lite::DDim{std::vector<int64_t>({oc, ic, ksize, ksize})};
const DDim& out_dim =
lite::DDim{std::vector<int64_t>({batch_size, oc, oh, ow})};
// element wise bias
const DDim& bias_dim = lite::DDim{std::vector<int64_t>({oc})};
param.x->Resize(input_dim);
param.filter->Resize(filter_dim);
param.output->Resize(out_dim);
if (bias_flag) {
param.bias->Resize(bias_dim);
}
kernel->SetParam(param);
size_t input_image_width = iw * ((ic + 3) / 4);
size_t input_image_height = ih * batch_size;
size_t out_image_width = ow * ((oc + 3) / 4);
size_t out_image_height = oh * batch_size;
size_t bias_image_width = ow * ((oc + 3) / 4);
size_t bias_image_height = oh * batch_size;
size_t filter_image_width = ksize * ((ic + 3) / 4);
size_t filter_image_height = oc * ksize;
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
std::default_random_engine engine;
std::uniform_real_distribution<float> gen(-5, 5);
std::vector<float> input_v(batch_size * ic * ih * iw);
std::vector<float> filter_v(oc * ic * ksize * ksize);
std::vector<float> output_v(batch_size * oc * oh * ow);
std::vector<float> bias_v(oc);
SHADOW_LOG << "gen input and filter ...";
for (auto& i : input_v) {
i = gen(engine);
// i = 1;
}
for (auto& f : filter_v) {
f = gen(engine);
// f = 1;
}
LOG(INFO) << "bias: " << bias_flag;
LOG(INFO) << "relu: " << relu_flag;
LOG(INFO) << "inputdims : " << input_dim;
LOG(INFO) << "filterdims: " << filter.dims();
LOG(INFO) << "outputdims : " << output.dims();
SHADOW_LOG << "after gen input and filter ...";
SHADOW_LOG << "input_v.size(): " << input_v.size();
SHADOW_LOG << "filter_v.size(): " << filter_v.size();
SHADOW_LOG << "output_v.size(): " << output_v.size();
SHADOW_LOG << "bias_v.size(): " << bias_v.size();
SHADOW_LOG << "input_dim.production(): "
<< input_dim.production();
SHADOW_LOG << "filter_dim.production(): "
<< filter_dim.production();
SHADOW_LOG << "out_dim.production(): " << out_dim.production();
SHADOW_LOG << "bias_dim.production(): " << bias_dim.production();
SHADOW_LOG << "4 * input_image_height * input_image_width: "
<< 4 * input_image_height * input_image_width;
SHADOW_LOG << "4 * filter_image_width * filter_image_height: "
<< 4 * filter_image_width * filter_image_height;
CHECK(input_dim.production() == input_v.size());
CHECK_LE(input_dim.production(),
4 * input_image_height * input_image_width);
CHECK(filter_dim.production() == filter_v.size());
CHECK_LE(filter_dim.production(),
4 * filter_image_width * filter_image_height);
paddle::lite::CLImageConverterDefault default_convertor;
SHADOW_LOG << "set mapped input ...";
std::vector<float> x_image_v(input_image_width *
input_image_height * 4); // 4 : RGBA
std::vector<float> filter_image_v(
filter_image_width * filter_image_height * 4); // 4 : RGBA
std::vector<float> bias_image_v(
bias_image_width * bias_image_height * 4); // 4 : RGBA
std::vector<float> out_image_v(out_image_width *
out_image_height * 4); // 4 : RGBA
default_convertor.NCHWToImage(
input_v.data(), x_image_v.data(), input_dim);
SHADOW_LOG << "输入: ---- ";
for (int i = 0; i < input_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << input_v[i];
}
SHADOW_LOG << "输入image : ---- ";
for (int i = 0; i < x_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << x_image_v[i];
}
SHADOW_LOG << "set mapped filter ...";
CLImageConverterFolder folder_convertor;
folder_convertor.NCHWToImage(
filter_v.data(), filter_image_v.data(), filter_dim);
SHADOW_LOG << "卷积核: ---- ";
for (int i = 0; i < filter_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_v[i];
}
SHADOW_LOG << "卷积核image: ---- ";
for (int i = 0; i < filter_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << filter_image_v[i];
}
auto* input_image2d = input.mutable_data<float, cl::Image2D>(
input_image_width, input_image_height, x_image_v.data());
// assign filter as target arm
filter.Assign<float, lite::DDim, TARGET(kARM)>(filter_v.data(),
filter_dim);
// auto* filter_image2d = filter.mutable_data<float,
// cl::Image2D>(
// filter_image_width,
// filter_image_height,
// filter_image_v.data());
if (bias_flag) {
for (int i = 0; i < bias_dim.production(); ++i) {
bias_v[i] = static_cast<int>(gen(engine));
}
bias.Assign<float, lite::DDim, TARGET(kARM)>(bias_v.data(),
bias_dim);
// CLImageConverterFolder folder_convertor;
// folder_convertor.NCHWToImage(
// bias_v.data(), bias_image_v.data(),
// bias_dim);
//
// auto* bias_data = bias.mutable_data<float,
// cl::Image2D>(
// bias_image_width, bias_image_height,
// bias_image_v.data());
}
SHADOW_LOG << "resize output ...";
output.Resize(out_dim);
// cpu conv basic calc
lite::Tensor out_ref;
out_ref.Resize(out_dim);
SHADOW_LOG << "prepare kernel ready";
SHADOW_LOG << "kernel launch ...";
kernel->Launch();
SHADOW_LOG << "mutable output ...";
auto* output_image2d = output.mutable_data<float, cl::Image2D>(
out_image_width, out_image_height);
auto* wait_list = context->As<OpenCLContext>().cl_wait_list();
auto* out_ptr = param.output->data<float, cl::Image2D>();
auto it = wait_list->find(out_ptr);
if (it != wait_list->end()) {
SHADOW_LOG << "--- Find the sync event for the target cl "
"tensor. ---";
auto& event = *(it->second);
event.wait();
} else {
LOG(FATAL) << "Could not find the sync event for the target "
"cl tensor.";
}
TargetWrapperCL::ImgcpySync(out_image_v.data(),
output.data<float, cl::Image2D>(),
out_image_width,
out_image_height,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
DDim out_image_shape =
default_convertor.InitImageDimInfoWith(output.dims());
default_convertor.ImageToNCHW(out_image_v.data(),
output_v.data(),
out_image_shape,
output.dims());
SHADOW_LOG << "输出: ---- ";
for (int i = 0; i < output_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << output_v[i];
}
SHADOW_LOG << "输出image: ---- ";
for (int i = 0; i < out_image_v.size(); i++) {
SHADOW_LOG << "(" << i << ")" << out_image_v[i];
}
SHADOW_LOG << "mutable_data out_ref_data: ";
// run cpu ref
auto* out_ref_data = out_ref.mutable_data<float>(TARGET(kARM));
SHADOW_LOG << " conv_basic beigin ..... ";
conv_basic<float, float>(input_v.data(),
out_ref_data,
batch_size,
oc,
oh,
ow,
ic,
ih,
iw,
filter_v.data(),
bias_v.data(), // mapped_bias,
group,
ksize,
ksize,
stride,
stride,
dilation,
dilation,
pad,
pad,
bias_flag,
relu_flag);
SHADOW_LOG << " conv_basic end ..... ";
SHADOW_LOG << " out_dim: " << out_dim;
const DDim& out_image_dims = lite::DDim{std::vector<int64_t>(
{static_cast<int64_t>(out_image_width),
static_cast<int64_t>(out_image_height)})};
for (int i = 0; i < out_dim.production(); i++) {
EXPECT_NEAR(output_v[i], out_ref_data[i], 1e-2);
if (std::abs(output_v[i] - out_ref_data[i]) > 1e-2) {
LOG(FATAL) << "error idx:" << i;
}
}
#ifdef LOOP_TEST
}
}
}
}
}
}
#else
// nothing to do.
#endif
}
#undef LOOP_TEST
#undef PRINT_RESULT
#undef SHADOW_LOG
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(conv2d, kOpenCL, kFloat, kImageDefault, image2d);
...@@ -23,6 +23,8 @@ namespace lite {
namespace kernels {
namespace opencl {
/* Buffer */
#if 0
void ElementwiseAddCompute::PrepareForRun() {
  auto& context = ctx_->As<OpenCLContext>();
  context.cl_context()->AddKernel(
...@@ -92,6 +94,124 @@ void ElementwiseAddCompute::UpdateParams() {
  VLOG(4) << "channels: " << channels_;
  VLOG(4) << "num: " << num_;
}
#endif
/* Image2D */
void ElementwiseAddImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto axis = ele_param_->axis;
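  // Route to a kernel variant by the shape of Y: a 4-D Y does a full
  // element-by-element add, while a 1-D Y is broadcast along the width
  // (axis == rank - 1) or the channel (axis == rank - 3).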
if (y->dims().size() == 4) {
kernel_func_name_ = "elementwise_add"; // y: ImageDefault
} else if (y->dims().size() == 1) {
if (axis == x->dims().size() - 1) {
kernel_func_name_ = "width_add"; // y: ImageDefault
} else if (axis == x->dims().size() - 3) {
kernel_func_name_ = "channel_add"; // y: ImageFolder
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
}
void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
auto axis = ele_param_->axis;
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
VLOG(4) << "axis:" << axis;
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>();
auto* out_img =
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
if (y_dims.size() == 4) {
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1) {
if (axis == x->dims().size() - 1 || axis == x->dims().size() - 3) {
int tensor_w = x->dims()[x->dims().size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
} else {
LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
<< ", x->dims().size():" << x->dims().size()
<< ", y->dims.size():" << y->dims().size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
}
} // namespace opencl
} // namespace kernels
...@@ -99,9 +219,36 @@ void ElementwiseAddCompute::UpdateParams() {
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(
//     elementwise_add, kOpenCL, kFloat, kNCHW, ocl::ElementwiseAddCompute, def)
//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
//     .Finalize();
// TODO(ysh329): Not fixed yet.
// "Y" may come from a constant value such as a conv bias (kARM, needs the
// cl_image_converter to run on the CPU), or from another branch like "X"
// (kOpenCL, nothing to do). The two situations need different actions when
// the pass runs (kernel picking), so set the target of "Y" to kOpenCL
// temporarily.
REGISTER_LITE_KERNEL(elementwise_add,
kOpenCL,
kFloat,
kImageDefault,
ocl::ElementwiseAddImageCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
    .Finalize();
...@@ -33,6 +33,10 @@ class ElementwiseAddCompute
  void Run() override;
  std::string doc() const override {
    return "ElementwiseAdd using cl::Buffer, kFloat";
  }
 protected:
  void UpdateParams();
...@@ -45,6 +49,28 @@ class ElementwiseAddCompute
  std::shared_ptr<cl::Event> event_{new cl::Event};
};
class ElementwiseAddImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
void PrepareForRun() override;
void Run() override;
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFloat";
}
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_add"};
std::string build_options_{" -DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
......
...@@ -22,6 +22,19 @@
namespace paddle {
namespace lite {
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  if (set_value == -1) {
    // default: fill with the index value
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
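// e.g. fill_data(buf, 8) writes 0,1,...,7 into buf, while
// fill_data(buf, 8, 0) fills buf with zeros.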
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
                             const dtype *y_data,
...@@ -46,25 +59,17 @@ void elementwise_compute_ref(const dtype *x_data,
  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
    num *= x_dims[i];
  }
VLOG(4) << "axis:" << axis;
VLOG(4) << "batch:" << batch;
VLOG(4) << "cahnnels:" << channels;
VLOG(4) << "num:" << num;
  // do elementwise add/sub/max/...
  if (elt_type == "add" && axis == 1 && y_dims.size() == 1) {
    for (int i = 0; i < x_dims.production(); ++i) {
      auto w = i % y_dims.production();
      out_data[i] = x_data[i] + y_data[w];
    }
  } else if (elt_type == "add") {
    for (int i = 0; i < batch; ++i) {
      for (int j = 0; j < channels; ++j) {
        int offset = (i * channels + j) * num;
...@@ -72,7 +77,7 @@ void elementwise_compute_ref(const dtype *x_data,
        const dtype diny_data = y_data[j];
        dtype *dout_ptr = out_data + offset;
        for (int k = 0; k < num; ++k) {
          *dout_ptr = *din_ptr + diny_data;
          if (use_relu) {
            *dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
          }
...@@ -86,7 +91,9 @@ void elementwise_compute_ref(const dtype *x_data,
  }
}
// buffer
#if 0
TEST(elementwise_add_buffer, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create( auto kernels = KernelRegistry::Global().Create(
"elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); "elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
...@@ -163,7 +170,7 @@ TEST(elementwise_add, compute) {
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
TEST(fusion_elementwise_add_activation_buffer, compute) {
LOG(INFO) << "to get kernel ..."; LOG(INFO) << "to get kernel ...";
auto kernels = auto kernels =
KernelRegistry::Global().Create("fusion_elementwise_add_activation", KernelRegistry::Global().Create("fusion_elementwise_add_activation",
...@@ -243,9 +250,204 @@ TEST(fusion_elementwise_add_activation, compute) {
  }
  TargetWrapperCL::Unmap(out_data, mapped_out);
}
#endif
// image
TEST(elementwise_add_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img on cpu) -> "
"elementwise_add(img) -> "
"layout(img2buf on cpu) "
"-> host";
  // Routing strategy across elementwise_add's 3 image kernels:
  // --------------------------------------------------------
  // 1. elementwise_add: needs y_dim.size() == 4
  //    (also picked by the fuse_elementwise_activation op, which further
  //     needs act_type == "relu")
  // 2. width_add:   needs y_dim.size() == 1 && x_dim.size() == 4 && axis == 3
  // 3. channel_add: needs y_dim.size() == 1 && x_dim.size() == 4 && axis == 1
// dims
const int n = 1;
const int c = 3;
const int h = 2;
const int w = 2;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
auto out_dim = x_dim;
// y_dim / axis / relu_flag
std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{w}),
DDim(std::vector<DDim::value_type>{w})};
std::vector<int> axis_v{-1, -1, 3, 1};
std::vector<bool> relu_flag_v{false, true, false, false};
  CHECK(y_dim_v.size() == axis_v.size() && axis_v.size() == relu_flag_v.size())
      << "y_dim_v, axis_v and relu_flag_v must have the same size and "
         "correspond to each other element by element";
// start loop
for (size_t case_idx = 0; case_idx < y_dim_v.size(); ++case_idx) {
auto y_dim = y_dim_v[case_idx];
auto axis = axis_v[case_idx];
auto relu_flag = relu_flag_v[case_idx];
LOG(INFO) << "================== elementwise_add, case_idx:" << case_idx + 1
<< "/" << y_dim_v.size() << " ===================";
LOG(INFO) << "x_dim:" << x_dim;
LOG(INFO) << "y_dim:" << y_dim;
LOG(INFO) << "out_dim:" << out_dim;
LOG(INFO) << "axis:" << axis;
LOG(INFO) << "relu_flag:" << relu_flag;
// tensor
VLOG(4) << "set tensors about op param";
lite::Tensor eleadd_x, eleadd_y, eleadd_out;
eleadd_x.Resize(x_dim);
eleadd_y.Resize(y_dim);
eleadd_out.Resize(out_dim);
// initialize tensors
VLOG(4) << "initialize tensors";
paddle::lite::CLImageConverterDefault default_convertor;
// x
std::vector<float> x_v(x_dim.production());
fill_data<float>(x_v.data(), x_v.size()); // fill with index value
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
eleadd_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data());
// y
std::vector<float> y_v(y_dim.production());
fill_data<float>(y_v.data(), y_v.size()); // fill with index value
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
eleadd_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data());
// out
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1];
eleadd_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production());
// operator param
operators::FusionElementwiseActivationParam
fuseEleaddParam; // enabled if relu_flag is true
fuseEleaddParam.X = &eleadd_x;
fuseEleaddParam.Y = &eleadd_y;
fuseEleaddParam.Out = &eleadd_out;
fuseEleaddParam.axis = axis;
fuseEleaddParam.act_type = relu_flag ? "relu" : "";
operators::ElementwiseParam eleaddParam;
eleaddParam.X = &eleadd_x;
eleaddParam.Y = &eleadd_y;
eleaddParam.Out = &eleadd_out;
eleaddParam.axis = axis;
auto op_param = relu_flag ? fuseEleaddParam : eleaddParam;
// set kernel
auto eleadd_img_kernels =
KernelRegistry::Global().Create("elementwise_add",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(eleadd_img_kernels.empty());
auto eleadd_img_kernel = std::move(eleadd_img_kernels.front());
VLOG(4) << "get eleadd kernel: " << eleadd_img_kernel->doc();
// set context and kernel args
VLOG(4) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
eleadd_img_kernel->SetParam(op_param);
std::unique_ptr<KernelContext> eleadd_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(eleadd_img_context->As<OpenCLContext>()));
eleadd_img_kernel->SetContext(std::move(eleadd_img_context));
// run kernel
VLOG(4) << "run kernel";
eleadd_img_kernel->Launch();
// download gpu result to cpu
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(),
eleadd_out.data<float, cl::Image2D>(),
out_img_w,
out_img_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
out_img_v.data(), out_v.data(), out_img_shape, out_dim);
// compute cpu reference
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
elementwise_compute_ref<float>(x_v.data(),
y_v.data(),
out_ref.get(),
x_dim,
y_dim,
op_param.axis,
"add",
relu_flag);
#if 0 // enable to check value of x and y
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", x_v[" << eidx << "]:"
<< x_v[eidx] << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
}
for (int i = 0; i < y_v.size(); i++) {
LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
}
#endif
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
EXPECT_NEAR(value, ref_value, 1e-6);
      if (std::abs(value - ref_value) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
break;
}
}
}
}
} // namespace lite
} // namespace paddle
// USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def);
// USE_LITE_KERNEL(fusion_elementwise_add_activation, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kImageDefault, def);
USE_LITE_KERNEL(
fusion_elementwise_add_activation, kOpenCL, kFloat, kImageDefault, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/opencl/elementwise_mul_compute.h"
#include <memory>
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
void ElementwiseMulFloatImageCompute::PrepareForRun() {
ele_param_ = param_.get_mutable<param_t>();
auto* y = ele_param_->Y;
auto y_dims = y->dims();
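  // Same routing idea as elementwise_add: pick the kernel variant from the
  // rank of Y (full elementwise when the shapes match, otherwise a
  // 1-D / 2-D / 4-D channel broadcast via channel_mul_d1/d2/d4).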
if (y_dims == ele_param_->X->dims()) {
kernel_func_name_ = "elementwise_mul";
} else if (y_dims.size() == 1) {
kernel_func_name_ = "channel_mul_d1";
} else if (y_dims.size() == 2) {
kernel_func_name_ = "channel_mul_d2";
} else if (y_dims.size() == 4) {
kernel_func_name_ = "channel_mul_d4";
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():" << y_dims.size()
<< ", x_dims.size():" << ele_param_->X->dims().size();
}
VLOG(4) << "kernel_func_name_:" << kernel_func_name_;
VLOG(4) << "y_dims:" << y_dims;
VLOG(4) << "y_dims.size():" << y_dims.size();
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_mul_kernel.cl", build_options_);
}
void ElementwiseMulFloatImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
auto* x = ele_param_->X;
auto* y = ele_param_->Y;
auto* out = ele_param_->Out;
VLOG(4) << "x->target():" << TargetToStr(x->target());
VLOG(4) << "y->target():" << TargetToStr(y->target());
VLOG(4) << "out->target():" << TargetToStr(out->target());
VLOG(4) << "x->dims():" << x->dims();
VLOG(4) << "y->dims():" << y->dims();
VLOG(4) << "out->dims():" << out->dims();
paddle::lite::CLImageConverterDefault default_convertor;
auto x_img_shape = default_convertor.InitImageDimInfoWith(x->dims()); // w, h
auto x_img_width = x_img_shape[0];
auto x_img_height = x_img_shape[1];
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out->dims()); // w, h
auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
auto* x_img = x->data<float, cl::Image2D>();
auto* y_img = y->data<float, cl::Image2D>();
auto* out_img =
out->mutable_data<float, cl::Image2D>(out_img_shape[0], out_img_shape[1]);
VLOG(4) << "x_img_shape[w,h]:" << x_img_width << " " << x_img_height;
VLOG(4) << "y_img_shape[w,h]:" << y_img_shape[0] << " " << y_img_shape[1];
VLOG(4) << "out_img_shape[w,h]:" << out_img_shape[0] << " "
<< out_img_shape[1];
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
auto y_dims = y->dims();
if (y_dims == ele_param_->X->dims()) {
// kernel: elementwise_mul(channel_mul_d4)
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 1 || y_dims.size() == 4) {
auto tensor_w = x->dims()[x->dims().size() - 1];
VLOG(4) << "tensor_w:" << tensor_w;
// kernel: channel_mul_d1 / channel_mul_d4
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(tensor_w));
CL_CHECK_FATAL(status);
} else if (y_dims.size() == 2) {
auto y_tensor_h = y->dims()[0];
auto y_tensor_w = y->dims()[1];
VLOG(4) << "y_tensor_w:" << y_tensor_w << " y_tensor_h:" << y_tensor_h;
// kernel: channel_mul_d2
cl_int status = kernel.setArg(arg_idx, *x_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *y_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_img);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(y_tensor_h));
CL_CHECK_FATAL(status);
} else {
LOG(FATAL) << "ElementwiseMul not supported y_dims.size():"
<< y_dims.size();
}
auto global_work_size = cl::NDRange{static_cast<cl::size_type>(x_img_width),
static_cast<cl::size_type>(x_img_height)};
auto status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
context.cl_wait_list()->emplace(out_img, event_);
VLOG(4) << "global_work_size:[2D]:" << x_img_width << " " << x_img_height;
}
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(elementwise_mul,
kOpenCL,
kFloat,
kImageDefault,
ocl::ElementwiseMulFloatImageCompute,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindInput("Y",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class ElementwiseMulFloatImageCompute
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::ElementwiseParam;
std::string doc() const override {
return "ElementwiseMul using cl::Image2D(ImageDefault/RGBA), kFP32";
}
void PrepareForRun() override;
void Run() override;
protected:
param_t* ele_param_{nullptr};
std::string kernel_func_name_{"elementwise_mul"};
std::string build_options_{"-DCL_DTYPE_float"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
template <typename dtype>
void fill_data(dtype *x, const int length, int set_value = -1) {
  if (set_value == -1) {
    // fill with index values
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = idx;
    }
  } else {
    // fill with the given constant
    for (int idx = 0; idx < length; ++idx) {
      x[idx] = set_value;
    }
  }
}
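// Host-side reference implementation. It covers three broadcast layouts:
// y with the same shape as x, a rank-1/rank-2 y broadcast over the trailing
// axes, and a rank-4 per-channel y such as [1, C, 1, 1].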
template <typename dtype>
void elementwise_compute_ref(const dtype *x_data,
const dtype *y_data,
dtype *out_data,
const DDim &x_dims,
const DDim &y_dims,
int axis,
const std::string elt_type,
bool use_relu = false) {
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
int batch = 1;
int channels = 1;
int num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i];
}
if (x_dims == y_dims || y_dims.size() == 2 || y_dims.size() == 1) {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype *din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype *dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr * diny_data;
if (use_relu) {
*dout_ptr = std::max(*dout_ptr, static_cast<dtype>(0));
}
dout_ptr++;
din_ptr++;
}
}
}
} else if (y_dims.size() == 4) {
// eg: x_dims: [1, 3, 2, 2]
// y_dims: [1, 3, 1, 1]
ASSERT_EQ(y_dims[2], y_dims[3]);
ASSERT_EQ(y_dims[2], 1);
ASSERT_EQ(y_dims[0], 1);
auto y_offset = y_dims.production();
auto x_offset = x_dims.production() / y_offset;
for (auto x = 0; x < x_dims.production(); ++x) {
auto y = x / x_offset;
out_data[x] = x_data[x] * y_data[y];
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type << std::endl;
}
}
// #define PRINT_RESULT
TEST(elemul_image2d_fp32, compute_kernel_elementwise_mul) {
LOG(INFO)
<< "main steps of test: host -> layout(buf2img on cpu) -> elemul(img) -> "
"layout(img2buf on cpu) "
"-> host";
// dims
const int n = 1;
const int c = 3;
const int h = 2;
const int w = 2;
const DDim x_dim = DDim(std::vector<DDim::value_type>{n, c, h, w});
auto out_dim = x_dim;
std::vector<DDim> y_dim_v{DDim(std::vector<DDim::value_type>{n, c, 1, 1}),
DDim(std::vector<DDim::value_type>{n, c, h, w}),
DDim(std::vector<DDim::value_type>{h, w}),
DDim(std::vector<DDim::value_type>{w})};
for (auto y_dim : y_dim_v) {
LOG(INFO) << "================== elementwise_mul ===================";
LOG(INFO) << "x_dim:" << x_dim << "\ty_dim:" << y_dim
<< "\tout_dim:" << out_dim;
// tensor
LOG(INFO) << "set tensors about op param";
lite::Tensor elemul_x, elemul_y, elemul_out;
elemul_x.Resize(x_dim);
elemul_y.Resize(y_dim);
elemul_out.Resize(out_dim);
// initialize tensors
VLOG(4) << "initialize tensors";
paddle::lite::CLImageConverterDefault default_convertor;
// x
std::vector<float> x_v(x_dim.production());
fill_data<float>(x_v.data(), x_v.size()); // fill with index value
auto x_img_shape = default_convertor.InitImageDimInfoWith(x_dim); // w, h
auto x_img_w = x_img_shape[0];
auto x_img_h = x_img_shape[1];
std::vector<float> x_img_v(x_img_w * x_img_h * 4); // 4: RGBA
default_convertor.NCHWToImage(x_v.data(), x_img_v.data(), x_dim);
elemul_x.mutable_data<float, cl::Image2D>(x_img_w, x_img_h, x_img_v.data());
// y
std::vector<float> y_v(y_dim.production());
fill_data<float>(y_v.data(), y_v.size()); // fill with index value
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dim); // w, h
auto y_img_w = y_img_shape[0];
auto y_img_h = y_img_shape[1];
std::vector<float> y_img_v(y_img_shape[0] * y_img_shape[1] * 4); // 4: RGBA
default_convertor.NCHWToImage(y_v.data(), y_img_v.data(), y_dim);
elemul_y.mutable_data<float, cl::Image2D>(y_img_w, y_img_h, y_img_v.data());
// out
auto out_img_shape =
default_convertor.InitImageDimInfoWith(out_dim); // w, h
auto out_img_w = out_img_shape[0];
auto out_img_h = out_img_shape[1];
elemul_out.mutable_data<float, cl::Image2D>(out_img_w, out_img_h);
std::vector<float> out_img_v(out_img_w * out_img_h * 4);
fill_data<float>(
out_img_v.data(), out_img_v.size(), 0); // fill with zero value
std::vector<float> out_v(out_dim.production());
// operator param
operators::ElementwiseParam elemulParam;
elemulParam.X = &elemul_x;
elemulParam.Y = &elemul_y;
elemulParam.Out = &elemul_out;
elemulParam.axis = -1;
// set kernel
auto elemul_img_kernels =
KernelRegistry::Global().Create("elementwise_mul",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(elemul_img_kernels.empty());
auto elemul_img_kernel = std::move(elemul_img_kernels.front());
VLOG(4) << "get elemul kernel: " << elemul_img_kernel->doc();
// set context and kernel args
VLOG(4) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
elemul_img_kernel->SetParam(elemulParam);
std::unique_ptr<KernelContext> elemul_img_context(new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(elemul_img_context->As<OpenCLContext>()));
elemul_img_kernel->SetContext(std::move(elemul_img_context));
// run kernel
VLOG(4) << "run kernel";
elemul_img_kernel->Launch();
// download gpu result to cpu
const size_t cl_image2d_row_pitch{0};
const size_t cl_image2d_slice_pitch{0};
TargetWrapperCL::ImgcpySync(out_img_v.data(),
elemul_out.data<float, cl::Image2D>(),
out_img_w,
out_img_h,
cl_image2d_row_pitch,
cl_image2d_slice_pitch,
IoDirection::DtoH);
default_convertor.ImageToNCHW(
out_img_v.data(), out_v.data(), out_img_shape, out_dim);
// compute cpu reference
std::unique_ptr<float[]> out_ref(new float[out_dim.production()]);
elementwise_compute_ref<float>(x_v.data(),
y_v.data(),
out_ref.get(),
x_dim,
y_dim,
elemulParam.axis,
"mul");
#if 0 // enable to check value of x and y
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", x_v[" << eidx << "]:"
<< x_v[eidx] << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
}
for (int i = 0; i < y_v.size(); i++) {
LOG(INFO) << "y_v[" << i << "]:" << y_v[i];
}
#endif
for (int eidx = 0; eidx < out_dim.production(); eidx++) {
auto value = out_v[eidx];
auto ref_value = out_ref.get()[eidx];
EXPECT_NEAR(value, ref_value, 1e-6);
    if (std::fabs(value - ref_value) > 1e-6) {
LOG(INFO) << "1st diff in this case at eidx[from 0]:" << eidx << " / "
<< out_dim.production() << ", value[" << eidx << "]:" << value
<< ", ref_value[" << eidx << "]:" << ref_value;
break;
}
}
}
}
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(elementwise_mul, kOpenCL, kFloat, kImageDefault, def);
...@@ -20,6 +20,9 @@ namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
/* Buffer */
#if 0
class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
 public:
  using param_t = operators::FusionElementwiseActivationParam;
...@@ -38,19 +41,60 @@ class FusionElementwiseAddActivationCompute : public ElementwiseAddCompute {
    }
  }
};
#endif
class FusionElementwiseAddActivationImageCompute
: public ElementwiseAddImageCompute {
public:
using param_t = operators::FusionElementwiseActivationParam;
void PrepareForRun() override {
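    // Appending -DRELU makes the elementwise_add kernel apply ReLU to its
    // result, fusing the activation into the add instead of launching a
    // separate activation kernel.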
build_options_ += " -DRELU";
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/elementwise_add_kernel.cl", build_options_);
ele_param_ = param_.get_mutable<param_t>();
auto act_t = static_cast<param_t*>(ele_param_)->act_type;
VLOG(4) << "act: " << act_t;
if (act_t != "relu") {
LOG(FATAL) << "Unsupported Activation type: " << act_t;
}
}
};
}  // namespace opencl
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
namespace ocl = paddle::lite::kernels::opencl;
// REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
// kOpenCL,
// kFloat,
// kNCHW,
// ocl::FusionElementwiseAddActivationCompute,
// def)
// .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
// .Finalize();
REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                     kOpenCL,
                     kFloat,
-                    kNCHW,
-                    ocl::FusionElementwiseAddActivationCompute,
+                    kImageDefault,
+                    ocl::FusionElementwiseAddActivationImageCompute,
                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kOpenCL))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))})
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindInput("Y",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kImageDefault))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kImageDefault))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/replace_stl/stream.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
class NearestInterpComputeFloatImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFloat";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
auto* x_buf = param.X->data<float, cl::Image2D>();
auto* out_buf =
param.Out->mutable_data<float, cl::Image2D>(param.out_w, param.out_h);
    const auto& y_dims = param.Out->dims();  // used to compute scales and dims
    float scale_h = static_cast<float>(y_dims[2]) / x_dims[2];
    float scale_w = static_cast<float>(y_dims[3]) / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
paddle::lite::CLImageConverterDefault default_convertor;
auto y_img_shape = default_convertor.InitImageDimInfoWith(y_dims); // w, h
auto y_img_width = y_img_shape[0];
LOG(INFO) << "y_img_width:" << y_img_width;
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(y_img_width / y_dims[3]),
static_cast<cl::size_type>(y_dims[3]),
static_cast<cl::size_type>(y_dims[0] * y_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_float "};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
class NearestInterpComputeFP16ImageDefault
: public KernelLite<TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault)> {
public:
using param_t = operators::InterpolateParam;
std::string doc() const override {
return "NearestInterp using cl::Image2D(ImageDefault/RGBA), kFP16";
}
void PrepareForRun() override {
auto& context = ctx_->As<OpenCLContext>();
context.cl_context()->AddKernel(
kernel_func_name_, "image/nearest_interp_kernel.cl", build_options_);
}
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const auto& x_dims = param.X->dims();
    // int16_t is used to represent half-precision (FP16) values.
    auto* x_buf = param.X->data<int16_t, cl::Image2D>();
    auto image_shape = InitImageDimInfoWith(x_dims);
    auto* out_buf = param.Out->mutable_data<int16_t, cl::Image2D>(
        image_shape["width"], image_shape["height"]);
    const auto& y_dims = param.Out->dims();  // used to compute scales and dims
    float scale_h = static_cast<float>(y_dims[2]) / x_dims[2];
    float scale_w = static_cast<float>(y_dims[3]) / x_dims[3];
int in_dims_h = x_dims[2];
int out_dims_h = y_dims[2];
int in_dims_w = x_dims[3];
int out_dims_w = y_dims[3];
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
STL::stringstream kernel_key;
kernel_key << kernel_func_name_ << build_options_;
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
int arg_idx = 0;
cl_int status = kernel.setArg(arg_idx, *x_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, *out_buf);
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const float>(scale_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_h));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(in_dims_w));
CL_CHECK_FATAL(status);
status = kernel.setArg(++arg_idx, static_cast<const int>(out_dims_w));
CL_CHECK_FATAL(status);
VLOG(4) << TargetToStr(param.X->target());
VLOG(4) << TargetToStr(param.Out->target());
VLOG(4) << "image_shape(w,h):" << image_shape["width"] << " "
<< image_shape["height"];
VLOG(4) << "x_dims[" << x_dims.size() << "D]:" << x_dims[0] << " "
<< x_dims[1] << " " << x_dims[2] << " " << x_dims[3];
VLOG(4) << "y_dims[" << y_dims.size() << "D]:" << y_dims[0] << " "
<< y_dims[1] << " " << y_dims[2] << " " << y_dims[3];
auto global_work_size =
cl::NDRange{static_cast<cl::size_type>(image_shape["width"]),
static_cast<cl::size_type>(image_shape["height"])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_.get());
CL_CHECK_FATAL(status);
// TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list`
// context.cl_wait_list()->emplace(out_buf, event_);
context.cl_context()->GetCommandQueue().finish();
}
private:
std::string kernel_func_name_{"nearest_interp"};
std::string build_options_{"-DCL_DTYPE_half"};
std::shared_ptr<cl::Event> event_{new cl::Event};
};
} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFloat,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFloatImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault))})
.Finalize();
REGISTER_LITE_KERNEL(
nearest_interp,
kOpenCL,
kFP16,
kImageDefault,
paddle::lite::kernels::opencl::NearestInterpComputeFP16ImageDefault,
ImageDefault)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
namespace paddle {
namespace lite {
template <typename dtype>
void nearest_interp_compute_ref(const dtype *src,
int w_in,
int h_in,
dtype *dst,
int w_out,
int h_out,
float scale_x,
float scale_y,
bool with_align = false) {
float scale_w_new = (with_align)
? (static_cast<float>(w_in - 1) / (w_out - 1))
: (static_cast<float>(w_in) / (w_out));
float scale_h_new = (with_align)
? (static_cast<float>(h_in - 1) / (h_out - 1))
: (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
dtype *dst_p = dst + h * w_out;
int near_y = static_cast<int>(scale_h_new * h + 0.5);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w + 0.5);
*dst_p++ = src[near_y * w_in + near_x];
}
}
} else {
for (int h = 0; h < h_out; ++h) {
dtype *dst_p = dst + h * w_out;
int near_y = static_cast<int>(scale_h_new * h);
for (int w = 0; w < w_out; ++w) {
int near_x = static_cast<int>(scale_w_new * w);
*dst_p++ = src[near_y * w_in + near_x];
}
}
}
}
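// e.g. with_align = false and a 3x4 -> 6x8 upscale: scale_h_new = 0.5 and
// scale_w_new = 0.5, so dst(h=1, w=3) samples src(0, 1).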
// #define LOOP_TEST
// #define PRINT_RESULT
TEST(nearest_interp_image2d_fp32, compute) {
LOG(INFO) << "main steps of test: host -> layout(buf2img) -> "
"nearest_interp(img) -> "
"layout(img2buf) "
"-> host";
#ifdef LOOP_TEST
for (int n : {1, 3}) {
for (auto c : {1, 3}) {
for (int h : {12, 20, 50, 112}) {
for (int w : {12, 20, 50, 112}) {
for (int out_h : {36, 60, 90, 224}) {
for (int out_w : {36, 60, 90, 224}) {
if (out_w < w || out_h < h) {
continue;
}
#else
const int n = 1;
const int c = 2;
const int h = 3;
const int w = 4;
const int out_h = 6;
const int out_w = 8;
#endif // LOOP_TEST
  float scale_x = static_cast<float>(out_w) / w;
  float scale_y = static_cast<float>(out_h) / h;
LOG(INFO) << "======== input shape[n,c,h,w]:" << n << " " << c
<< " " << h << " " << w << " ========" << out_h << " "
<< out_w;
// set layout kernels
auto buf_to_img_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kImageDefault));
auto img_to_buf_kernels =
KernelRegistry::Global().Create("layout",
TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW));
auto nearest_interp_img_kernels =
KernelRegistry::Global().Create("nearest_interp",
TARGET(kOpenCL),
PRECISION(kFloat),
DATALAYOUT(kImageDefault));
ASSERT_FALSE(buf_to_img_kernels.empty());
                ASSERT_FALSE(img_to_buf_kernels.empty());
ASSERT_FALSE(nearest_interp_img_kernels.empty());
auto buf_to_img_kernel = std::move(buf_to_img_kernels.front());
auto img_to_buf_kernel = std::move(img_to_buf_kernels.front());
auto nearest_interp_img_kernel =
std::move(nearest_interp_img_kernels.front());
LOG(INFO) << "get 1st kernel: " << buf_to_img_kernel->doc();
LOG(INFO) << "get 2nd kernel: " << img_to_buf_kernel->doc();
LOG(INFO) << "get 3rd kernel: "
<< nearest_interp_img_kernel->doc();
// set tensors about op param
LOG(INFO) << "set tensors about op param";
// layout(buf->img): x -> nearest_interp_in
// nearest_interp(img): nearest_interp_in -> nearest_interp_out
// layout(img->buf): nearest_interp_out -> y
lite::Tensor x, y, nearest_interp_in, nearest_interp_out, y_ref;
operators::LayoutParam BufferToImageParam;
operators::LayoutParam ImageToBufferParam;
BufferToImageParam.x = &x;
BufferToImageParam.y = &nearest_interp_in;
ImageToBufferParam.x = &nearest_interp_out;
ImageToBufferParam.y = &y;
operators::InterpolateParam NearestInterpParam;
NearestInterpParam.X = &nearest_interp_in;
NearestInterpParam.Out = &nearest_interp_out;
NearestInterpParam.out_h = out_h;
NearestInterpParam.out_w = out_w;
const DDim x_dim =
DDim(std::vector<DDim::value_type>{n, c, h, w});
const DDim y_dim =
DDim(std::vector<DDim::value_type>{n, c, out_h, out_w});
x.Resize(x_dim);
y.Resize(y_dim);
nearest_interp_in.Resize(x_dim);
nearest_interp_out.Resize(y_dim);
y_ref.Resize(y_dim);
auto nearest_interp_image2d_shape =
paddle::lite::kernels::opencl::InitImageDimInfoWith(x_dim);
// initialize tensors
LOG(INFO) << "initialize tensors";
auto *x_data = x.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data = y.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
auto *y_data_ref = y_ref.mutable_data<float>(TARGET(kARM));
auto *mapped_x = static_cast<float *>(TargetWrapperCL::Map(
x_data, 0, sizeof(float) * x_dim.production()));
auto *mapped_y = static_cast<float *>(TargetWrapperCL::Map(
y_data, 0, sizeof(float) * y_dim.production()));
for (int i = 0; i < x_dim.production(); ++i) {
mapped_x[i] = static_cast<int>(i) - x_dim.production() / 2;
}
for (int i = 0; i < y_dim.production(); ++i) {
mapped_y[i] = static_cast<int>(0);
}
auto *nearest_interp_in_data =
nearest_interp_in.mutable_data<float, cl::Image2D>(
nearest_interp_image2d_shape["width"],
nearest_interp_image2d_shape["height"]);
auto *nearest_interp_out_data =
nearest_interp_out.mutable_data<float, cl::Image2D>(y_dim[3],
y_dim[2]);
// set context and kernel args
LOG(INFO) << "set context and kernel args";
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
buf_to_img_kernel->SetParam(BufferToImageParam);
std::unique_ptr<KernelContext> buf_to_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(buf_to_img_context->As<OpenCLContext>()));
buf_to_img_kernel->SetContext(std::move(buf_to_img_context));
img_to_buf_kernel->SetParam(ImageToBufferParam);
std::unique_ptr<KernelContext> img_to_buf_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(img_to_buf_context->As<OpenCLContext>()));
img_to_buf_kernel->SetContext(std::move(img_to_buf_context));
nearest_interp_img_kernel->SetParam(NearestInterpParam);
std::unique_ptr<KernelContext> nearest_interp_img_context(
new KernelContext);
context->As<OpenCLContext>().CopySharedTo(
&(nearest_interp_img_context->As<OpenCLContext>()));
nearest_interp_img_kernel->SetContext(
std::move(nearest_interp_img_context));
// run kernels
LOG(INFO) << "run kernel: buf_to_img_kernel";
buf_to_img_kernel->Launch();
LOG(INFO) << "run kernel: nearest_interp_img_kernel";
nearest_interp_img_kernel->Launch();
LOG(INFO) << "run kernel: img_to_buf_kernel";
img_to_buf_kernel->Launch();
// compute ref cpu
for (int nid = 0; nid < x_dim[0]; ++nid) {
for (int cid = 0; cid < x_dim[1]; ++cid) {
float *x_nc =
mapped_x + (nid * x_dim[1] + cid) * x_dim[3] * x_dim[2];
float *y_nc =
y_data_ref + (nid * x_dim[1] + cid) * y_dim[3] * y_dim[2];
nearest_interp_compute_ref<float>(x_nc,
x_dim[3],
x_dim[2],
y_nc,
y_dim[3],
y_dim[2],
1 / scale_x,
1 / scale_y);
}
}
// result
#ifdef PRINT_RESULT
LOG(INFO) << "---- print kernel result (input -> output) ----";
for (int eidx = 0; eidx < x_dim.production(); ++eidx) {
std::cout << mapped_x[eidx] << " ";
}
std::cout << std::endl;
for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
std::cout << mapped_y[eidx] << " ";
}
std::cout << std::endl;
for (int eidx = 0; eidx < y_dim.production(); ++eidx) {
std::cout << y_data_ref[eidx] << " ";
}
std::cout << std::endl;
#endif // PRINT_RESULT
// check result: compare kernel output and cpu output(y_data_ref)
for (int eidx = 0; eidx < y_dim.production(); eidx++) {
EXPECT_NEAR(y_data_ref[eidx], mapped_y[eidx], 1e-6);
                  if (std::fabs(y_data_ref[eidx] - mapped_y[eidx]) > 1e-6) {
LOG(FATAL) << "1st diff in this case at eidx[from 0]:" << eidx
<< " / " << x_dim.production() << ", y_data_ref["
<< eidx << "]:" << y_data_ref[eidx]
<< ", mapped_y[" << eidx << "]:" << mapped_y[eidx];
break;
}
}
// free
LOG(INFO) << "free: unmap x, y";
TargetWrapperCL::Unmap(x_data, mapped_x);
TargetWrapperCL::Unmap(y_data, mapped_y);
#ifdef LOOP_TEST
}
}
} // w
} // h
} // c
} // n
#else
// nothing to do.
#endif
}
} // namespace lite
} // namespace paddle
// nearest_interp buffer
// USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kNCHW, def);
// nearest_interp image2d fp32
USE_LITE_KERNEL(layout, kOpenCL, kAny, kImageDefault, NCHW_to_ImageDefault);
USE_LITE_KERNEL(layout, kOpenCL, kAny, kNCHW, ImageDefault_to_NCHW);
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFloat, kImageDefault, ImageDefault);
// nearest_interp image2d fp16
USE_LITE_KERNEL(nearest_interp, kOpenCL, kFP16, kImageDefault, ImageDefault);
...@@ -100,3 +100,4 @@ lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.
lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_x86)
#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86)
lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86)
+lite_cc_test(test_leaky_relu_compute_x86 SRCS leaky_relu_compute_test.cc DEPS activation_compute_x86)
...@@ -36,6 +36,17 @@ REGISTER_LITE_KERNEL(relu,
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
+// float
+REGISTER_LITE_KERNEL(leaky_relu,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::LeakyReluCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
// float
REGISTER_LITE_KERNEL(tanh,
                     kX86,
...
...@@ -117,6 +117,40 @@ class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  virtual ~ReluCompute() = default;
};
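// leaky_relu(x) = x > 0 ? x : alpha * x, computed below as max(x, alpha * x)
// (the two forms agree whenever alpha < 1)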
template <typename T>
struct LeakyReluFunctor {
float alpha;
explicit LeakyReluFunctor(float alpha_) : alpha(alpha_) {}
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
}
};
template <typename T>
class LeakyReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
auto X = param.X;
auto Out = param.Out;
auto place = lite::fluid::EigenDeviceType<TARGET(kX86)>();
CHECK(X);
CHECK(Out);
auto x = lite::fluid::EigenVector<T>::Flatten(*X);
auto out = lite::fluid::EigenVector<T>::Flatten(*Out);
LeakyReluFunctor<T> functor(param.Leaky_relu_alpha);
functor(place, x, out);
}
virtual ~LeakyReluCompute() = default;
};
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
...
...@@ -131,7 +131,6 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto* w = param.w;
    auto* bias = param.bias;
    auto* output = param.output;
-   int in_num_col_dims = param.in_num_col_dims;
    bool with_relu = (param.activation_type == "relu") ? true : false;
    bool padding_weights = param.padding_weights;
...@@ -139,17 +138,7 @@ class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
    auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
    auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
-   DDim out_dims;
-   out_dims.resize(static_cast<size_t>(in_num_col_dims + 1));
-   const auto& in_dims = input->dims();
-   for (int i = 0; i < in_num_col_dims; ++i) {
-     out_dims[i] = in_dims[i];
-   }
-   out_dims[in_num_col_dims] = w_dims1;
-   output->Resize(out_dims);
-   output->set_lod(input->lod());
-   int M = out_dims.production() / w_dims1;
+   int M = output->dims().production() / w_dims1;
    const T* input_data = input->data<T>();
    const T* w_data = w->data<T>();
...
...@@ -155,7 +155,6 @@ TEST(layer_norm_x86, run_test) {
  ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon);
  for (int j = 0; j < out.dims().production(); ++j) {
    EXPECT_NEAR(out_data[j], ref_data[j], 1e-5);
-   // LOG(INFO) << out_data[j];
  }
  LOG(INFO) << *mean_data;
  LOG(INFO) << *var_data;
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/x86/activation_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(leaky_relu_x86, retrieve_op) {
auto leaky_relu =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
"leaky_relu");
ASSERT_FALSE(leaky_relu.empty());
ASSERT_TRUE(leaky_relu.front());
}
TEST(leaky_relu_x86, init) {
LeakyReluCompute<float> leaky_relu;
ASSERT_EQ(leaky_relu.precision(), PRECISION(kFloat));
ASSERT_EQ(leaky_relu.target(), TARGET(kX86));
}
TEST(leaky_relu_x86, run_test) {
lite::Tensor x, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i) / 12.0 - 0.5;
}
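  // x holds i / 12 - 0.5 for i in [0, 12); with alpha = 0.05 every negative
  // entry is scaled by 0.05, e.g. x[0] = -0.5 -> out[0] = -0.025.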
LeakyReluCompute<float> leaky_relu;
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
param.Leaky_relu_alpha = 0.05;
leaky_relu.SetParam(param);
leaky_relu.Run();
std::vector<float> ref_data({-0.025,
-0.02083333,
-0.01666667,
-0.0125,
-0.00833333,
-0.00416667,
0.,
0.08333334,
0.16666667,
0.25,
0.33333334,
0.41666666});
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-05);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(leaky_relu, kX86, kFloat, kNCHW, def);
...@@ -28,6 +28,8 @@ namespace x86 {
template <typename T>
void Compute(const lite::Tensor* in, lite::Tensor* out) {
+ // In CopyDataFrom, the target tensor's dims will be set to the source
+ // tensor's dims.
  auto out_dims = out->dims();
  out->CopyDataFrom(*in);
  out->Resize(out_dims);
...
...@@ -78,7 +78,11 @@ int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  if (graph->Has(x_name)) {
    x_node = graph->Get(x_name);
  } else {
-   x_node = graph->Add(x_name, *x, in_ptype);
+   CHECK(x->precision() == in_ptype)
+       << "The data type of input tensor X should be "
+       << PrecisionToStr(in_ptype) << ", but received "
+       << PrecisionToStr(x->precision());
+   x_node = graph->Add(x_name, *x);
  }
  // Cast node
...
...@@ -45,6 +45,7 @@ int SizeOfType(framework::proto::VarType::Type type) {
  DO(FP16, float);
  DO(FP32, float);
  DO(INT8, int8_t);
+ DO(INT16, int16_t);
  DO(INT32, int);
  DO(INT64, int64_t);
#undef DO
...
...@@ -152,10 +152,8 @@ void ParamDesc::SetDim(const std::vector<int64_t>& dim) {
  auto& data_builder = desc_->GetField<PrimaryListBuilder<char>>("data"); \
  auto data = data_builder.data(); \
  size_t size = data_builder.size() / sizeof(T); \
- auto* data_ptr = reinterpret_cast<const T*>(data); \
- for (size_t i = 0; i < size; ++i) { \
-   res.push_back(data_ptr[i]); \
- } \
+ res.resize(size); \
+ memcpy(&res[0], data, data_builder.size()); \
  return res; \
}
...
...@@ -27,7 +27,7 @@ bool ConcatOpLite::CheckShape() const {
}
bool ConcatOpLite::InferShape() const {
- std::vector<lite::Tensor *> &inputs = param_.x;
+ const std::vector<Tensor *> &inputs = param_.x;
  const size_t n = inputs.size();
  CHECK_GT_OR_FALSE(n, 0);
...@@ -45,7 +45,7 @@ bool ConcatOpLite::InferShape() const {
  auto out_dims = inputs[0]->dims();
  size_t in_zero_dims_size = out_dims.size();
  for (size_t i = 1; i < n; i++) {
-   auto &input_dims_i = inputs[i]->dims();
+   const auto &input_dims_i = inputs[i]->dims();
    for (size_t j = 0; j < in_zero_dims_size; j++) {
      if (j == static_cast<size_t>(axis)) {
        out_dims[axis] += input_dims_i[j];
...
...@@ -49,17 +49,17 @@ bool FcOpLite::CheckShape() const {
}
bool FcOpLite::InferShape() const {
- const auto &input_dims = param_.input->dims();
- const auto &w_dims = param_.w->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& w_dims = param_.w->dims();
  int in_num_col_dims = param_.in_num_col_dims;
+ int64_t w_dims_1 = param_.padding_weights ? w_dims[1] - 4 : w_dims[1];
  // Set output dims
- DDim output_dims;
- output_dims.resize(in_num_col_dims + 1);
+ std::vector<DDim::value_type> output_dims(in_num_col_dims + 1);
  for (int i = 0; i < in_num_col_dims; ++i) {
    output_dims[i] = input_dims[i];
  }
- output_dims[in_num_col_dims] = w_dims[1];
+ output_dims[in_num_col_dims] = w_dims_1;
  param_.output->Resize(output_dims);
  // share LoD
...@@ -67,7 +67,7 @@ bool FcOpLite::InferShape() const {
  return true;
}
- bool FcOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  auto input = op_desc.Input("Input").front();
  auto W = op_desc.Input("W").front();
  auto out = op_desc.Output("Out").front();
...
...@@ -28,8 +28,8 @@ bool GRUOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.batch_hidden)
  CHECK_OR_FALSE(param_.hidden)
- const auto &input_dims = param_.input->dims();
- const auto &weight_dims = param_.weight->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& weight_dims = param_.weight->dims();
  int input_size = input_dims[1];
  int frame_size = weight_dims[0];
  CHECK_EQ_OR_FALSE(input_size, frame_size * 3)
...@@ -52,8 +52,8 @@ bool GRUOpLite::CheckShape() const {
}
bool GRUOpLite::InferShape() const {
- const auto &input_dims = param_.input->dims();
- const auto &weight_dims = param_.weight->dims();
+ const auto& input_dims = param_.input->dims();
+ const auto& weight_dims = param_.weight->dims();
  int frame_size = weight_dims[0];
  auto batch_size = input_dims[0];
...@@ -68,7 +68,7 @@ bool GRUOpLite::InferShape() const {
  return true;
}
- bool GRUOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+ bool GRUOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
  auto input = op_desc.Input("Input").front();
  auto weight = op_desc.Input("Weight").front();
  auto batch_gate = op_desc.Output("BatchGate").front();
...
...@@ -25,8 +25,8 @@ bool LookupTableOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.Ids)
  CHECK_OR_FALSE(param_.Out)
- const auto &table_dims = param_.W->dims();
- const auto &ids_dims = param_.Ids->dims();
+ const auto& table_dims = param_.W->dims();
+ const auto& ids_dims = param_.Ids->dims();
  int ids_rank = ids_dims.size();
...@@ -37,23 +37,20 @@ bool LookupTableOpLite::CheckShape() const {
}
bool LookupTableOpLite::InferShape() const {
- const auto &table_dims = param_.W->dims();
- // LOG(INFO) << "table_dims: " << table_dims;
- const auto &ids_dims = param_.Ids->dims();
- // LOG(INFO) << "ids_dims: " << ids_dims;
+ const auto& table_dims = param_.W->dims();
+ const auto& ids_dims = param_.Ids->dims();
  auto out_dims = ids_dims;
  int ids_rank = ids_dims.size();
  out_dims[ids_rank - 1] = table_dims[1];
- // LOG(INFO) << "out_dims: " << out_dims;
  param_.Out->Resize(out_dims);
  param_.Out->set_lod(param_.Ids->lod());
  return true;
}
- bool LookupTableOpLite::AttachImpl(const cpp::OpDesc &op_desc,
-                                    lite::Scope *scope) {
+ bool LookupTableOpLite::AttachImpl(const cpp::OpDesc& op_desc,
+                                    lite::Scope* scope) {
  auto input = op_desc.Input("W").front();
  auto ids = op_desc.Input("Ids").front();
  auto out = op_desc.Output("Out").front();
...
...@@ -49,8 +49,7 @@ bool ReduceOp::InferShape() const {
    param_.output->Resize(std::vector<int64_t>{1});
  } else {
    size_t out_rank = keep_dim ? x_rank : x_rank - dims.size();
-   DDim out_dims;
-   out_dims.resize(out_rank);
+   std::vector<DDim::value_type> out_dims(out_rank);
    if (keep_dim) {
      for (size_t i = 0; i < dims.size(); ++i) {
        out_dims[dims[i]] = 1;
...
...@@ -34,7 +34,7 @@ bool ReshapeOp::InferShape() const {
  std::vector<int> final_shape;
  if (shape_tensor_vct.size() > 0) {
    final_shape.resize(shape_tensor_vct.size());
-   for (int i = 0; i < shape_tensor_vct.size(); i++) {
+   for (size_t i = 0; i < shape_tensor_vct.size(); i++) {
      final_shape[i] = shape_tensor_vct[i]->data<int>()[0];
    }
  } else if (shape_tensor != nullptr) {
...@@ -100,8 +100,7 @@ bool Reshape2Op::CheckShape() const {
bool Reshape2Op::InferShape() const {
  ReshapeOp::InferShape();
  const auto &x_dims = param_.x->dims();
- DDim xshape_dims;
- xshape_dims.resize(x_dims.size() + 1);
+ std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1);
  xshape_dims[0] = 0;
  for (size_t i = 0; i < x_dims.size(); i++) {
    xshape_dims[i + 1] = x_dims[i];
...@@ -119,16 +118,24 @@ bool Reshape2Op::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
  return true;
}
- DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims) {
+ static bool CheckPositive(const DDim &dims) {
+   for (size_t i = 0; i < dims.size(); ++i) {
+     if (dims[i] <= 0) {
+       return false;
+     }
+   }
+   return true;
+ }
+ std::vector<DDim::value_type> ValidateShape(const std::vector<int> &shape,
+                                             const DDim &input_dims) {
  const DDim::value_type input_size = input_dims.production();
- // Only one dimension can be set to -1, whose size will be automatically
+ // only one dimension can be set to -1, whose size will be automatically
  // infered.
  const int unk_dim_val = -1;
  const int copy_dim_val = 0;
- DDim output_dims;
- output_dims.resize(shape.size());
  DDim::value_type capacity = 1;
  int unk_dim_idx = -1;
  for (size_t i = 0; i < shape.size(); ++i) {
...@@ -152,7 +159,7 @@ DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims) {
  }
  if (unk_dim_idx != -1) {
-   if (input_dims.CheckPositive()) {
+   if (CheckPositive(input_dims)) {
      // input_size < 0 and is un-determinate in compile time, skip the check,
      // for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
      // capacity = -24, input_size = -8, output_dims[0] = 0
...
...@@ -56,7 +56,8 @@ class Reshape2Op : public ReshapeOp {
  std::string DebugString() const override { return "reshape2"; }
};
- DDim ValidateShape(const std::vector<int> &shape, const DDim &input_dims);
+ std::vector<DDim::value_type> ValidateShape(const std::vector<int> &shape,
+                                             const DDim &input_dims);
}  // namespace operators
}  // namespace lite
...
...@@ -306,8 +306,8 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
                    const float leakey_relu_scale) {}
#endif  // LITE_WITH_ARM
- // TODO(chenjiaoAngel): fix me, diff: 3x3 depthwise conv
- #if 0  /// 3x3dw
+ // TODO(chenjiaoAngel): fix multi-threads, diff: 3x3 depthwise conv
+ #if 1  /// 3x3dw
TEST(TestConv3x3DW, test_conv3x3_depthwise) {
  if (FLAGS_basic_test) {
    for (auto& stride : {1, 2}) {
...@@ -334,7 +334,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
                    {1, 1},
                    flag_bias,
                    flag_act,
-                   {1, 2, 4},
+                   {1},
                    {FLAGS_power_mode},
                    leakey_relu_scale);
          }
...@@ -352,12 +352,7 @@ TEST(TestConv3x3DW, test_conv3x3_depthwise) {
#if 1  /// 5x5dw
TEST(TestConv5x5DW, test_conv5x5_depthwise) {
  if (FLAGS_basic_test) {
- #ifdef __aarch64__
- // TODO(chenjiaoAngel): fix me, diff: arm64 5x5s2 depthwise conv
- for (auto& stride : {1}) {
- #else
    for (auto& stride : {1, 2}) {
- #endif
      for (auto& pad_left : {0, 1, 2}) {
        for (auto& pad_right : {0, 1, 2}) {
          for (auto& pad_top : {0, 1, 2}) {
...