diff --git a/docs/faq.md b/docs/faq.md index bb66e9a436301b36274388042dc2ec72ba6dcf8c..d0f8b3953412f1378acdff18f067eaaeadc237b3 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -46,7 +46,7 @@ due to high memory usage or fragmentation. Several solutions can be tried: Why is the performance worse than the official result for the same model? ------------------------------------------------------------------------- -The power options may not set properly, see `mace/public/mace_runtime.h` for +The power options may not be set properly, see `mace/public/mace.h` for details. Why is the UI getting poor responsiveness when running model with GPU runtime? @@ -64,4 +64,4 @@ Running models on Hexagon DSP need a few prerequisites for DSP developers: * You need sign your phone by using testsig provided by Qualcomm. (Download Qualcomm Hexagon SDK first, plugin your phone to PC, run scripts/testsig.py) * You need install Hexagon nnlib backend by following nnlib README (https://github.com/XiaoMi/nnlib). -Then, there you go. You can run Mace on Hexagon DSP. \ No newline at end of file +Then, there you go. You can run Mace on Hexagon DSP. diff --git a/docs/user_guide/basic_usage.rst b/docs/user_guide/basic_usage.rst index eb067c49a5b42fc2f33d8a614da6f067eb5452ff..15a4d5168ebebef494156184991c4640624d227c 100644 --- a/docs/user_guide/basic_usage.rst +++ b/docs/user_guide/basic_usage.rst @@ -299,8 +299,7 @@ header files. 
├── include │   └── mace │   └── public - │   ├── mace.h - │   └── mace_runtime.h + │   └── mace.h ├── lib │   ├── arm64-v8a │   │ └── cpu_gpu diff --git a/mace/python/tools/converter_tool/caffe_converter.py b/mace/python/tools/converter_tool/caffe_converter.py index a2e89a6d45b1fc397e5e888d0368676d28234824..8ce0a0741d6b791aa16c4137b297521280e93109 100644 --- a/mace/python/tools/converter_tool/caffe_converter.py +++ b/mace/python/tools/converter_tool/caffe_converter.py @@ -186,6 +186,7 @@ class CaffeConverter(base_converter.ConverterInterface): 'InnerProduct': self.convert_fully_connected, 'BatchNorm': self.convert_folded_batchnorm, 'Crop': self.convert_crop, + 'Scale': self.convert_scale, } self._option = option self._mace_net_def = mace_pb2.NetDef() @@ -604,3 +605,49 @@ class CaffeConverter(base_converter.ConverterInterface): mace_pb2.DT_FLOAT, bias_data) op.input.extend([bias_tensor_name]) + + def convert_scale(self, caffe_op): + op = self.convert_general_op(caffe_op) + op.type = MaceOp.Eltwise.name + + scale_op_name = op.name + op.name = scale_op_name + '_prod' + + type_arg = op.arg.add() + type_arg.name = MaceKeyword.mace_element_type_str + type_arg.i = EltwiseType.PROD.value + + scale_tensor_name = scale_op_name + '_scale' + scale_data = caffe_op.blobs[0] + self.add_tensor(scale_tensor_name, scale_data.shape, + mace_pb2.DT_FLOAT, scale_data) + op.input.extend([scale_tensor_name]) + + if len(caffe_op.blobs) == 2: + bias_tensor_name = scale_op_name + '_offset' + bias_data = caffe_op.blobs[1] + # caffe of old version has 4-dimension bias, so reshape it + # to single dimension + self.add_tensor(bias_tensor_name, bias_data.reshape(-1).shape, + mace_pb2.DT_FLOAT, + bias_data) + op.input.extend([bias_tensor_name]) + + biasadd_op = self._mace_net_def.op.add() + biasadd_op.name = scale_op_name + '_biasadd' + biasadd_op.type = MaceOp.BiasAdd.name + biasadd_op.output.extend(op.output) + op.output[:] = [op.output[0] + '_prod_output'] + biasadd_op.input.extend(op.output) 
+ biasadd_op.input.extend([op.input[2]]) + + biasadd_op.output_shape.extend(op.output_shape) + + del op.input[2] + + data_type_arg = biasadd_op.arg.add() + data_type_arg.name = 'T' + data_type_arg.i = self._option.data_type + + ConverterUtil.add_data_format_arg(biasadd_op, + DataFormat.NCHW) diff --git a/mace/python/tools/converter_tool/shape_inference.py b/mace/python/tools/converter_tool/shape_inference.py index fbc22783ce09156f34cf859c82019d882c9c2dd7..9478a3e545f9dc203ea01f03b1d32a4452ad964b 100644 --- a/mace/python/tools/converter_tool/shape_inference.py +++ b/mace/python/tools/converter_tool/shape_inference.py @@ -47,6 +47,7 @@ class ShapeInference(object): MaceOp.Softmax.name: self.infer_shape_general, MaceOp.FullyConnected.name: self.infer_shape_fully_connected, MaceOp.Crop.name: self.infer_shape_crop, + MaceOp.BiasAdd.name: self.infer_shape_general, } self._net = net diff --git a/mace/python/tools/converter_tool/transformer.py b/mace/python/tools/converter_tool/transformer.py index 7d6893442acf982529ea5c2b38425bf33795eb5a..d24823fd61adfb6c73fd8433151a8901fb00fc45 100644 --- a/mace/python/tools/converter_tool/transformer.py +++ b/mace/python/tools/converter_tool/transformer.py @@ -344,12 +344,14 @@ class Transformer(base_converter.ConverterInterface): == EltwiseType.PROD.value) \ and len(op.input) == 2 \ and op.input[1] in self._consts \ + and op.output_shape[0].dims[-1:] == \ + self._consts[op.input[1]].dims \ and self.consumer_count(op.output[0]) == 1 \ and not self.is_op_output_node(op): consumer_op = self._consumers[op.output[0]][0] if (consumer_op.type == MaceOp.Eltwise.name and ConverterUtil.get_arg( - op, MaceKeyword.mace_element_type_str).i + consumer_op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value or consumer_op.type == MaceOp.BiasAdd.name) \ and len(consumer_op.input) == 2 \ @@ -359,10 +361,8 @@ class Transformer(base_converter.ConverterInterface): consumer_op.type = MaceOp.BatchNorm.name consumer_op.input[:] = [op.input[0], 
op.input[1], consumer_op.input[1]] - - self.safe_remove_node(op, None) + net.op.remove(op) return True - return False def fold_squared_diff_mean(self):