diff --git a/x2paddle/convert.py b/x2paddle/convert.py
index 84bafc0f0467944406343f565076a16e5bb57e1f..8b76ea159e7a66aeab25fccceb5ab6140c802502 100644
--- a/x2paddle/convert.py
+++ b/x2paddle/convert.py
@@ -137,14 +137,17 @@ def onnx2paddle(model_path, save_dir):
     except:
         print("onnx is not installed, use \"pip install onnx==1.5.0\".")
         return
+    print("Now translating model from onnx to paddle.")
 
     from x2paddle.decoder.onnx_decoder import ONNXDecoder
-    from x2paddle.op_mapper.onnx_op_mapper import ONNXOpMapper
-    from x2paddle.optimizer.onnx_optimizer import ONNXOptimizer
-    print("Now translating model from onnx to paddle.")
     model = ONNXDecoder(model_path)
+
+    from x2paddle.op_mapper.onnx_op_mapper import ONNXOpMapper
     mapper = ONNXOpMapper(model)
+
+    from x2paddle.optimizer.onnx_optimizer import ONNXOptimizer
     optimizer = ONNXOptimizer(mapper)
+
     optimizer.delete_redundance_code()
     mapper.save_inference_model(save_dir)
diff --git a/x2paddle/decoder/onnx_backend.py b/x2paddle/decoder/onnx_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeebeb92731281302fb5334a88baf86e04c0360b
--- /dev/null
+++ b/x2paddle/decoder/onnx_backend.py
@@ -0,0 +1,1088 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Part of the following code in this file refers to
+# https://github.com/pytorch/pytorch/blob/master/caffe2/python/onnx/backend.py
+# PyTorch is BSD-style licensed, as found in the LICENSE file:
+# https://github.com/pytorch/pytorch/blob/master/LICENSE
+"""Backend for running ONNX on Caffe2
+
+To run this, you will need to have Caffe2 installed as well.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import os
+import collections
+from subprocess import Popen, PIPE
+import zipfile
+import itertools
+
+# When onnx is built against a version of protobuf that is older than
+# the one vendored with caffe2, onnx will crash if caffe2's vendored
+# protobuf is loaded first. We can work around this by importing onnx
+# first, which will cause it to go out and pick up the system protobuf.
+import onnx.backend + +import caffe2 +from caffe2.python import core, workspace, rnn_cell, gru_cell +from caffe2.python.compatibility import container_abcs +from caffe2.python.model_helper import ModelHelper +from caffe2.proto import caffe2_pb2 +import caffe2.python.utils +import numpy as np +import onnx +from onnx import checker, GraphProto, TensorProto, AttributeProto, ModelProto +import onnx.numpy_helper +import onnx.defs +import onnx.optimizer +import onnx.shape_inference +import onnx.utils +from onnx.backend.base import Backend, Device, DeviceType, namedtupledict + +from caffe2.python.onnx.workspace import Workspace +from caffe2.python.onnx.backend_rep import Caffe2Rep +from caffe2.python.onnx.backend_cpp_rep import Caffe2CppRep + +import caffe2.python._import_c_extension as C + +import warnings + + +def force_unicode(s): + try: + return s.decode('utf-8') + except AttributeError: + return s + + +def get_device_option(device): + m = { + DeviceType.CPU: caffe2_pb2.CPU, + DeviceType.CUDA: workspace.GpuDeviceType + } + return core.DeviceOption(m[device.type], device.device_id) + + +class OnnxAttributes(dict): + """ + This is a more convenient way to work with ONNX/Caffe2 attributes + that is not the protobuf representation. + """ + @staticmethod + def from_onnx(args): + d = OnnxAttributes() + for arg in args: + d[arg.name] = convertAttributeProto(arg) + return d + + def caffe2(self, kmap=lambda k: k): + for k, v in self.items(): + if kmap(k) != '': + yield caffe2.python.utils.MakeArgument(kmap(k), v) + + +# TODO: Move this into ONNX main library +def convertAttributeProto(onnx_arg): + """ + Convert an ONNX AttributeProto into an appropriate Python object + for the type. + + NB: Tensor attribute gets returned as the straight proto. + """ + if onnx_arg.HasField('f'): + return onnx_arg.f + elif onnx_arg.HasField('i'): + return onnx_arg.i + elif onnx_arg.HasField('s'): + return onnx_arg.s + elif onnx_arg.HasField('t'): + return onnx_arg.t # this is a proto! + elif onnx_arg.HasField('g'): + return Caffe2Backend._graph_to_net(onnx_arg.g, + Caffe2Backend._known_opset_version) + elif len(onnx_arg.floats): + return list(onnx_arg.floats) + elif len(onnx_arg.ints): + return list(onnx_arg.ints) + elif len(onnx_arg.strings): + return list(onnx_arg.strings) + elif len(onnx_arg.graphs): + retval = [] + # TODO: this doesn't work with RNN ops + for g in onnx_arg.graphs: + retval.append( + Caffe2Backend._graph_to_net(g, + Caffe2Backend._known_opset_version)) + return retval + else: + raise ValueError("Unsupported ONNX attribute: {}".format(onnx_arg)) + + +# TODO: Move this into ONNX main library +class OnnxNode(object): + """ + Reimplementation of NodeProto from ONNX, but in a form + more convenient to work with from Python. + + We may temporarily edit these nodes to get them into Caffe2 form, + before actually translating into the Caffe2 protobuf, since this + is easier than decomposing everything, and putting it back together + when we're ready. + """ + def __init__(self, node): + self.name = str(node.name) + self.op_type = str(node.op_type) + self.attrs = OnnxAttributes.from_onnx(node.attribute) + self.inputs = list(node.input) + self.outputs = list(node.output) + + +Caffe2Ops = collections.namedtuple('Caffe2Ops', + ['ops', 'init_ops', 'interface_blobs']) + + +class Caffe2Backend(Backend): + + # The greatest version of the ONNX operator set which we are aware of. 
+ # Models whose version is larger than this will cause us to emit a warning + # that we are attempting to translate on a "best effort" basis. + # + # If you increase this, make SURE you cross-reference all BC-breaking + # changes from one version to the next, and any that you did not + # implement, mark as broken in _broken_operators + _known_opset_version = 9 + + # This dictionary will record operators which are KNOWN to be + # broken, so we give a good error message rather than do something + # bogus and then fail. + _broken_operators = { + # 'BrokenOp': version_it_was_broken_in + } + + # Operators that are different between Caffe2 and + # ONNX but only in their name. + # In most cases, this should be empty - as the effort of ONNX is + # to unify the operator definitions. + _renamed_operators = { + 'GlobalMaxPool': 'MaxPool', + 'GlobalAveragePool': 'AveragePool', + 'Pad': 'PadImage', + 'Neg': 'Negative', + 'BatchNormalization': 'SpatialBN', + 'InstanceNormalization': 'InstanceNorm', + 'MatMul': 'BatchMatMul', + 'Upsample': 'ResizeNearest', + 'Identity': 'Copy', + 'InstanceNormalization': 'InstanceNorm', + 'Equal': 'EQ', + 'Less': 'LT', + 'Greater': 'GT', + 'Unsqueeze': 'ExpandDims', + 'Loop': 'ONNXWhile', + 'Tile': 'NumpyTile', + 'RandomNormal': 'GaussianFill', + 'RandomUniform': 'UniformFill', + } + + _global_renamed_attrs = {'kernel_shape': 'kernels'} + _per_op_renamed_attrs = { + 'Squeeze': { + 'axes': 'dims' + }, + 'Unsqueeze': { + 'axes': 'dims' + }, + 'Transpose': { + 'perm': 'axes' + }, + 'Upsample': { + 'mode': '', + 'scales': '' + }, + 'ConvTranspose': { + 'output_padding': 'adjs' + }, + 'Selu': { + 'gamma': 'scale' + }, + 'If': { + 'then_branch': 'then_net', + 'else_branch': 'else_net' + }, + 'RandomUniform': { + 'low': 'min', + 'high': 'max' + } + } + + # operators whose behavior is different beyond renaming + # the value is an attribute of this class that is a + # function from ToffeIR node_def to caffe2 op_def + _special_operators = { + 'LSTM': '_create_rnn_variant', + 'GRU': '_create_rnn_variant', + 'RNN': '_create_rnn_variant', + 'Loop': '_create_loop', + 'If': '_create_if', + 'Upsample': '_create_upsample', + 'RandomNormal': '_create_gaussian_fill' + } + + # Dummy name generator + _dummy_name = C.DummyName() + + @classmethod + def dummy_name(cls): + return cls._dummy_name.new_dummy_name() + + # NB: By default, you will use the LATEST definition of the operator, + # so this interface MAY make BC-breaking changes. Specify an + # opset_version if you don't want this to version. + @classmethod + def run_node(cls, + node, + inputs, + device='CPU', + opset_version=_known_opset_version, + outputs_info=None): + super(Caffe2Backend, cls).run_node(node, + inputs, + device=device, + outputs_info=outputs_info, + opset_version=opset_version) + + value_infos = [] + device_option = get_device_option(Device(device)) + ws = Workspace() + with core.DeviceScope(device_option): # temporary! 
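+            # Feed every provided input into a scratch workspace and record a
+            # serialized value_info (name, dtype, shape) for it, so the C++
+            # converter below can translate this single node with full
+            # shape/type information.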
+ if isinstance(inputs, dict): + for key, value in inputs.items(): + ws.FeedBlob(key, value) + value_infos.append( + onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[ + value.dtype], + shape=value.shape).SerializeToString()) + else: + assert len(node.input) == len( + inputs), "{}: expected {} but got {}".format( + node.op_type, len(node.input), len(inputs)) + for key, value in zip(node.input, inputs): + ws.FeedBlob(key, value) + value_infos.append( + onnx.helper.make_tensor_value_info( + name=key, + elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[ + value.dtype], + shape=value.shape).SerializeToString()) + + ops = [] + cbackend = C.Caffe2Backend(cls._dummy_name) + ops_str = cbackend.convert_node(node.SerializeToString(), + value_infos, opset_version) + for s in ops_str[0] + ops_str[1]: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + op.device_option.CopyFrom(device_option) + ops.append(op) + ws.RunOperatorsOnce(ops) + output_values = [ws.FetchBlob(name) for name in node.output] + return namedtupledict('Outputs', node.output)(*output_values) + + @classmethod + def _create_tensor_filling_op(cls, onnx_tensor, name=None): + """ + Given an Onnx TensorProto, translate it into a Caffe2 operator + which produces the given tensor filling op. + """ + assert name or onnx_tensor.name + name = name or onnx_tensor.name + + c2_op = caffe2_pb2.OperatorDef() + + c2_values = c2_op.arg.add() + c2_values.name = "values" + + def tensor2list(onnx_tensor): + # Use the onnx.numpy_helper because the data may be raw + return onnx.numpy_helper.to_array(onnx_tensor).flatten().tolist() + + if onnx_tensor.data_type in [TensorProto.FLOAT]: + c2_op.type = 'GivenTensorFill' + c2_values.floats.extend(tensor2list(onnx_tensor)) + elif onnx_tensor.data_type in [TensorProto.DOUBLE]: + c2_op.type = 'GivenTensorDoubleFill' + c2_values.floats.extend(tensor2list(onnx_tensor)) + elif onnx_tensor.data_type in [TensorProto.INT64, TensorProto.UINT32]: + c2_op.type = 'GivenTensorInt64Fill' + c2_values.ints.extend(tensor2list(onnx_tensor)) + elif onnx_tensor.data_type in [ + TensorProto.UINT8, TensorProto.INT8, TensorProto.UINT16, + TensorProto.INT16, TensorProto.INT32 + ]: + c2_op.type = 'GivenTensorIntFill' + c2_values.ints.extend(tensor2list(onnx_tensor)) + elif onnx_tensor.data_type == TensorProto.BOOL: + c2_op.type = 'GivenTensorBoolFill' + c2_values.ints.extend(tensor2list(onnx_tensor)) + elif onnx_tensor.data_type == TensorProto.STRING: + c2_op.type = 'GivenTensorStringFill' + c2_values.strings.extend(onnx_tensor.string_data) + else: + raise RuntimeError("unrecognized tensor type {}".format( + onnx_tensor.data_type)) + + c2_shape = c2_op.arg.add() + c2_shape.name = "shape" + c2_shape.ints.extend(onnx_tensor.dims) + + c2_op.output.append(name) + + return c2_op + + @classmethod + def _rnn_reform_weights(cls, reforms, name, hidden_size, init_net, gates, + reorder_indices): + for name_from, name_to, do_concat, extra_dims in reforms: + gate_blobs = [ + '%s/%s_%s' % (name, prefix, name_to) for prefix in gates + ] + for i, x in enumerate(gate_blobs): + dim0 = i * hidden_size, (i + 1) * hidden_size + starts, ends = zip(dim0, *extra_dims) + init_net.Slice(name_from, x, starts=starts, ends=ends) + if do_concat: + reordered_gate_blobs = [gate_blobs[i] for i in reorder_indices] + init_net.Concat(reordered_gate_blobs, + ['%s/%s' % (name, name_to), + cls.dummy_name()], + axis=0) + + @classmethod + def _make_rnn_direction(cls, input_blob, B, W, R, initial_states_and_names, + sequence_lens, 
pred_mh, init_net, input_size, + hidden_size, num_gates, direction_offset, Bi, Br, + W_, R_, reform, make_cell, keep_outputs): + name = cls.dummy_name() + + # input and recurrence biases are squashed together in onnx + # but not in caffe2 + gates_hidden_size = num_gates * hidden_size + bias_offset = 2 * direction_offset * gates_hidden_size + weight_offset = direction_offset * gates_hidden_size + Bi = init_net.Slice(B, + name + Bi, + starts=[bias_offset + 0 * gates_hidden_size], + ends=[bias_offset + 1 * gates_hidden_size]) + Br = init_net.Slice(B, + name + Br, + starts=[bias_offset + 1 * gates_hidden_size], + ends=[bias_offset + 2 * gates_hidden_size]) + W_ = init_net.Slice(W, + name + W_, + starts=[weight_offset + 0 * gates_hidden_size, 0], + ends=[weight_offset + 1 * gates_hidden_size, -1]) + R_ = init_net.Slice(R, + name + R_, + starts=[weight_offset + 0 * gates_hidden_size, 0], + ends=[weight_offset + 1 * gates_hidden_size, -1]) + + initial_states_sliced = [] + for initial_state, name_suffix in initial_states_and_names: + initial_states_sliced.append( + pred_mh.net.Slice(initial_state, + name + name_suffix, + starts=[direction_offset + 0, 0, 0], + ends=[direction_offset + 1, -1, -1])) + + if direction_offset == 1: + if sequence_lens is not None: + seq_lens_for_reverse = sequence_lens + else: + input_shape = pred_mh.net.Shape(input_blob, + name + '/input_shape') + batch_size = pred_mh.net.Slice(input_shape, + name + '/batch_size_slice', + starts=[1], + ends=[2]) + seq_len = pred_mh.net.Slice(input_shape, + name + '/seq_len_slice', + starts=[0], + ends=[1]) + dummy_sequence_lens = pred_mh.net.Tile([seq_len, batch_size], + name + + '/dummy_sequence_lens', + axis=0) + pred_mh.net.Reshape( + dummy_sequence_lens, + [dummy_sequence_lens, cls.dummy_name()], + shape=[-1]) + seq_lens_for_reverse = pred_mh.net.Cast(dummy_sequence_lens, + name + + '/seq_lens_for_reverse', + to=core.DataType.INT32) + reform(Bi, Br, W_, R_, name, hidden_size, init_net) + + if direction_offset == 1: + input = pred_mh.net.ReversePackedSegs( + [input_blob, seq_lens_for_reverse], name + "/input-reversed") + else: + input = input_blob + + outputs = keep_outputs( + list( + make_cell( + pred_mh, + input, + sequence_lens, + initial_states_sliced, + input_size, + hidden_size, + name, + drop_states=False, + forward_only=True, + ))) + + if direction_offset == 1: + outputs[0] = pred_mh.net.ReversePackedSegs( + [outputs[0], seq_lens_for_reverse], name + "/output-reversed") + + return outputs + + @classmethod + def _create_rnn_variant(cls, init_model, pred_model, n, opset_version): + assert init_model is not None, "cannot convert RNNs without access to the full model" + assert pred_model is not None, "cannot convert RNNs without access to the full model" + + attrs = dict(n.attrs) # make a copy, which is safe to mutate + hidden_size = attrs.pop('hidden_size') + direction = force_unicode(attrs.pop('direction', 'forward')) + + if n.op_type == 'RNN': + activation = force_unicode( + attrs.pop('activations', ('tanh', ))[0].lower()) + elif n.op_type == 'GRU': + linear_before_reset = attrs.pop('linear_before_reset', 0) + + assert not attrs, "unsupported RNN attributes: " + str(attrs.keys()) + assert direction in ['forward', 'bidirectional' + ], "unsupported backwards RNN/GRU/LSTM" + + if n.op_type in ['RNN', 'GRU']: + input_blob, W, R, B, sequence_lens, initial_h = n.inputs + elif n.op_type == 'LSTM': + input_blob, W, R, B, sequence_lens, initial_h, initial_c = n.inputs + + if sequence_lens == "": + sequence_lens = None + + for x in 
itertools.chain(init_model.graph.input, + init_model.graph.value_info, + pred_model.graph.input, + pred_model.graph.value_info): + if x.name == W: + input_size = x.type.tensor_type.shape.dim[2].dim_value + break + else: + raise RuntimeError( + "best-effort shape inference for RNN/GRU/LSTM failed") + + pred_mh = ModelHelper() + init_net = core.Net("init-net") + + init_net.Reshape(W, [W, cls.dummy_name()], shape=[1, -1, 0]) + init_net.Squeeze(W, W, dims=[0]) + init_net.Reshape(R, [R, cls.dummy_name()], shape=[1, -1, 0]) + init_net.Squeeze(R, R, dims=[0]) + init_net.Reshape(B, [B, cls.dummy_name()], shape=[1, -1]) + init_net.Squeeze(B, B, dims=[0]) + + if n.op_type == 'RNN': + + def reform(*args): + pass + + def make_cell(*args, **kwargs): + return rnn_cell.BasicRNN(*args, activation=activation, **kwargs) + + def make_rnn(direction_offset): + return cls._make_rnn_direction( + input_blob, B, W, R, [(initial_h, '/initial_h')], + sequence_lens, pred_mh, init_net, input_size, hidden_size, + 1, direction_offset, "/i2h_b", "/gates_t_b", "/i2h_w", + "/gates_t_w", reform, make_cell, lambda x: x) + + elif n.op_type == 'GRU': + + def reform(Bi, Br, W_, R_, name, hidden_size, init_net): + # caffe2 has a different order from onnx. We need to rearrange + # z r h -> r z h + reforms = ((W_, 'i2h_w', True, [(0, -1)]), (R_, 'gate_t_w', + False, [(0, -1)]), + (Bi, 'i2h_b', True, []), (Br, 'gate_t_b', False, [])) + cls._rnn_reform_weights(reforms, name, hidden_size, init_net, + ['update', 'reset', 'output'], + [1, 0, 2]) + + def make_cell(*args, **kwargs): + return gru_cell.GRU(*args, + linear_before_reset=linear_before_reset, + **kwargs) + + def make_rnn(direction_offset): + return cls._make_rnn_direction( + input_blob, B, W, R, [(initial_h, '/initial_h')], + sequence_lens, pred_mh, init_net, input_size, hidden_size, + 3, direction_offset, "_bias_i2h", "_bias_gates", + "/i2h_w_pre", "/gates_t_w_pre", reform, make_cell, + lambda x: x) + + elif n.op_type == 'LSTM': + + def reform(Bi, Br, W_, R_, name, hidden_size, init_net): + # caffe2 has a different order from onnx. We need to rearrange + # i o f c -> i f o c + reforms = ((W_, 'i2h_w', True, [(0, -1)]), (R_, 'gates_t_w', + True, [(0, -1)]), + (Bi, 'i2h_b', True, []), (Br, 'gates_t_b', True, [])) + cls._rnn_reform_weights(reforms, name, hidden_size, init_net, + ['input', 'output', 'forget', 'cell'], + [0, 2, 1, 3]) + + def make_cell(*args, **kwargs): + return rnn_cell.LSTM(*args, **kwargs) + + def make_rnn(direction_offset): + return cls._make_rnn_direction( + input_blob, B, W, R, [(initial_h, '/initial_h'), + (initial_c, '/initial_c')], + sequence_lens, pred_mh, init_net, input_size, hidden_size, + 4, direction_offset, "/i2h_b", "/gates_t_b", "/i2h_w", + "/gates_t_w", reform, make_cell, + lambda x: [x[0], x[1], x[3]]) + + if direction == 'forward': + outputs = make_rnn(0) + + # in the forward case, storage is shared between the + # last outputs. 
We need to decouple them so that the + # VariableLengthSequencePadding only mutates + # n.outputs[0] + for i in range(1, len(outputs)): + pred_mh.net.Copy(outputs[i], n.outputs[i]) + + if sequence_lens is not None: + pred_mh.net.VariableLengthSequencePadding( + [outputs[0], sequence_lens], [outputs[0]]) + pred_mh.net.ExpandDims([outputs[0]], [n.outputs[0]], dims=[1]) + elif direction == 'bidirectional': + outputs_f = make_rnn(0) + outputs_b = make_rnn(1) + + concatted_output, _ = pred_mh.net.Concat( + [outputs_f[0], outputs_b[0]], + [cls.dummy_name(), cls.dummy_name()], + axis=2) + if sequence_lens is not None: + pred_mh.net.VariableLengthSequencePadding( + [concatted_output, sequence_lens], [concatted_output]) + reshaped_output, _ = pred_mh.net.Reshape( + concatted_output, + [cls.dummy_name(), cls.dummy_name()], + shape=[0, 0, -1, 2]) + pred_mh.net.Transpose(reshaped_output, + n.outputs[0], + axes=[0, 2, 1, 3]) + for i in range(1, len(n.outputs)): + pred_mh.net.Concat( + [outputs_f[i], outputs_b[i]], + [n.outputs[i], cls.dummy_name()], + axis=0) + + # We want to decide whether to put all of our weight-reshaping + # operators in the init net or the predict net. We can put + # them in the init net iff the inputs to those operators are + # already available, either as graph initializers, or as the + # output of other operators in the init net. The latter case + # occurs, for example, when exporting from pytorch to onnx. + # In most production use, we expect has_initializers to be + # true. + initializers = {i.name for i in init_model.graph.initializer} + outputs = { + output + for node in init_model.graph.node for output in node.output + } + has_initializers = all(x in initializers or x in outputs + for x in (W, R, B)) + + pred_ops = [] + init_ops = [] + (init_ops if has_initializers else pred_ops).extend(init_net.Proto().op) + pred_ops.extend(pred_mh.Proto().op) + + return Caffe2Ops(pred_ops, init_ops, + list(pred_mh.Proto().external_input)) + + @classmethod + def _create_control_op(cls, init_model, pred_model, n, opset_version): + control_inputs = [] + if '__control_inputs' in n.attrs: + control_inputs.extend(n.attrs['__control_inputs']) + node = cls._common_onnx_node_to_caffe2_op(init_model, pred_model, n, + opset_version) + node.control_input.extend(control_inputs) + return Caffe2Ops([node], [], []) + + @classmethod + def _remove_ssa(cls, net, remap_dict): + for op in net.op: + for i, name in enumerate(op.output): + if name in remap_dict: + op.output[i] = remap_dict[name] + for i, out in enumerate(net.external_output): + if out in remap_dict: + net.external_output[i] = remap_dict[out] + + @classmethod + def _create_if(cls, init_model, pred_model, n, opset_version): + ops = cls._create_control_op(init_model, pred_model, n, opset_version) + assert ops[0][0].type == 'If' + if_op = ops[0][0] + then_net = else_net = None + control_inputs = [] + for arg in if_op.arg: + if arg.name == 'then_net': + then_net = arg.n + if arg.name == 'else_net': + else_net = arg.n + if arg.name == '__control_inputs': + control_inputs = arg.strings + + assert then_net and else_net + then_net_outs = then_net.external_output + else_net_outs = else_net.external_output + op_outputs = if_op.output + assert len(then_net_outs) == len(else_net_outs) + assert len(else_net_outs) == len(op_outputs) + + for arg in if_op.arg: + if arg.name == 'then_net': + arg.n.external_input.extend(control_inputs) + if arg.name == 'else_net': + arg.n.external_input.extend(control_inputs) + + return ops + + @classmethod + def 
_create_loop(cls, init_model, pred_model, n, opset_version): + ops = cls._create_control_op(init_model, pred_model, n, opset_version) + assert ops[0][0].type == 'ONNXWhile' + while_op = ops[0][0] + while_op.arg.extend( + [caffe2.python.utils.MakeArgument('has_trip_count', True)]) + while_op.arg.extend( + [caffe2.python.utils.MakeArgument('has_cond', True)]) + while_op.arg.extend( + [caffe2.python.utils.MakeArgument('disable_scopes', True)]) + control_inputs = [] + for arg in while_op.arg: + if arg.name == '__control_inputs': + control_inputs = arg.strings + num_loop_carried_deps = 0 + for arg in while_op.arg: + if arg.name == 'body': + num_loop_carried_deps = len(arg.n.external_input) - 2 + arg.n.external_input.extend(control_inputs) + while_op.arg.extend([ + caffe2.python.utils.MakeArgument('num_loop_carried_deps', + num_loop_carried_deps) + ]) + + return ops + + @classmethod + def _substitute_raw_value(cls, tp, raw_values_dict): + if tp.HasField('raw_data') and tp.raw_data == bytes(b'__EXTERNAL'): + if tp.name not in raw_values_dict: + raise RuntimeError( + 'TensorProto for value {} referenced raw data but it was not found!' + .format(tp.name)) + else: + tp.raw_data = raw_values_dict[tp.name] + + @classmethod + def _visit_and_substitute_raw_values(cls, nodes, raw_values_dict): + for node in nodes: + for attr in node.attribute: + if attr.HasField('t'): + cls._substitute_raw_value(attr.t, raw_values_dict) + for t in attr.tensors: + cls._substitute_raw_value(t, raw_values_dict) + if attr.HasField('g'): + cls._visit_and_substitute_raw_values( + attr.g.node, raw_values_dict) + for g in attr.graphs: + cls._visit_and_substitute_raw_values( + g.node, raw_values_dict) + + @classmethod + def _external_value_resolution_pass(cls, model, raw_values_dict): + for init in model.graph.initializer: + cls._substitute_raw_value(init, raw_values_dict) + + cls._visit_and_substitute_raw_values(model.graph.node, raw_values_dict) + + @classmethod + def _direct_initialize_parameters(cls, initializer, ws, device_option): + for tp in initializer: + ws.FeedBlob(tp.name, onnx.numpy_helper.to_array(tp), device_option) + + @classmethod + def _direct_initialize_inputs(cls, inputs, initialized, ws, device_option): + for value_info in inputs: + if value_info.name in initialized: + continue + shape = list(d.dim_value + for d in value_info.type.tensor_type.shape.dim) + ws.FeedBlob( + value_info.name, + np.ones(shape, + dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[ + value_info.type.tensor_type.elem_type]), + device_option) + + @staticmethod + def optimize_onnx(input, init=False, predict=False): + passes = [ + 'fuse_consecutive_transposes', 'eliminate_nop_transpose', + 'fuse_transpose_into_gemm', 'lift_lexical_references' + ] + if init: + passes.append('split_init') + if predict: + passes.append('split_predict') + out = onnx.optimizer.optimize(input, passes) + return out + + @classmethod + def prepare_zip_archive(cls, file, device='CPU', **kwargs): + with zipfile.ZipFile(file, mode='r') as z: + with z.open('__MODEL_PROTO', 'r') as f: + model = onnx.load(f) + blob_names = set(z.namelist()) - set('__MODEL_PROTO') + # TODO: make this more efficient + raw_values_dict = {} + for name in blob_names: + with z.open(name, 'r') as blob_file: + raw_values_dict[name] = blob_file.read() + + return cls.prepare(model, + device, + raw_values_dict=raw_values_dict, + **kwargs) + + @classmethod + def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): + ''' + For Onnx Caffe2Backend, we require that init_graph don't initialize the 
actual input of the predict_graph, + + for example, if "img" is the input blob for the predict_net, we require that in init_graph and in + initializer of the predict_graph, "img" is not initalized. We don't have a check for this, since + there is no way we can know which blob is the input of the predict_graph. + ''' + if not kwargs.pop('no_check_UNSAFE', False): + super(Caffe2Backend, cls).prepare(model, device, **kwargs) + opset_version = None + for imp in model.opset_import: + if not imp.HasField("domain") or imp.domain == "": + opset_version = imp.version + if imp.version > cls._known_opset_version: + warnings.warn( + "This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}. We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail." + .format(cls._known_opset_version, imp.version)) + else: + warnings.warn("Unrecognized operator set {}".format(imp.domain)) + if opset_version is None: + if model.ir_version >= 0x00000003: + raise RuntimeError( + "Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)" + ) + else: + opset_version = 1 + + ws = Workspace() + device_option = get_device_option(Device(device)) + + init_net, predict_net = cls._onnx_model_to_caffe2_net( + model, device, opset_version, False) + + if raw_values_dict: + cls._external_value_resolution_pass(model, raw_values_dict) + + # Directly load initializer data into blobs in workspace + cls._direct_initialize_parameters( + model.graph.initializer, + ws, + device_option, + ) + + initialized = {init.name for init in model.graph.initializer} + + cls._direct_initialize_inputs( + model.graph.input, + initialized, + ws, + device_option, + ) + + uninitialized = [ + value_info.name for value_info in model.graph.input + if value_info.name not in initialized + ] + + retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) + return retval + + @classmethod + # TODO: This method needs a refactor for clarity + def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, + opset_version): + cbackend = C.Caffe2Backend(cls._dummy_name) + if cbackend.support_onnx_import(node_def.op_type): + + # extract value infos from pred model (value infos of + # node's inputs that are in init model should be all + # available in pred model) + value_infos = [] + for name in node_def.input: + if pred_model is not None: + for vi in itertools.chain(pred_model.graph.input, + pred_model.graph.output, + pred_model.graph.value_info): + if vi.name == name: + value_infos.append(vi.SerializeToString()) + + op_strs = cbackend.convert_node(node_def.SerializeToString(), + value_infos, opset_version) + init_ops = [] + for s in op_strs[0]: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + init_ops.append(op) + ops = [] + for s in op_strs[1]: + op = caffe2_pb2.OperatorDef() + op.ParseFromString(s) + ops.append(op) + return Caffe2Ops(ops, init_ops, []) + + if node_def.op_type in cls._special_operators: + translator = getattr(cls, cls._special_operators[node_def.op_type]) + else: + translator = cls._common_onnx_node_to_caffe2_op + ops = translator(init_model, pred_model, OnnxNode(node_def), + opset_version) + if isinstance(ops, Caffe2Ops): + return ops + if not isinstance(ops, container_abcs.Iterable): + ops = [ops] + return Caffe2Ops(ops, [], []) + + _broadcast_operators = { + 'Add', + 'Sub', + } + + @classmethod + def _common_onnx_node_to_caffe2_op(cls, init_model, 
pred_model, onnx_node, + opset_version): + """ + This translator performs the basic translation of ONNX nodes into + Caffe2 operators. Besides doing a straightforward marshalling from + one format to another, it also does these extra things: + + - Renames operators based on '_renamed_operators' + - Renames attributes based on '_global_renamed_attrs' and + '_per_op_renamed_attrs' + + If you're writing a custom translator, consider calling this first, + and then fixing things up further. + """ + c2_op = caffe2_pb2.OperatorDef() + + c2_op.input.extend(onnx_node.inputs) + c2_op.output.extend(onnx_node.outputs) + c2_op.name = onnx_node.name + + onnx_op_type = onnx_node.op_type + broken_version = cls._broken_operators.get(onnx_op_type, float('Inf')) + if broken_version <= opset_version: + raise ValueError( + "Don't know how to translate op {} in ONNX operator set v{} (I only support prior to v{})" + .format(onnx_op_type, opset_version, broken_version)) + c2_op.type = cls._renamed_operators.get(onnx_op_type, onnx_op_type) + if not core.IsOperator(c2_op.type): + raise ValueError( + "Don't know how to translate op {}".format(onnx_op_type)) + + def kmap(k): + if (onnx_op_type in cls._per_op_renamed_attrs + and k in cls._per_op_renamed_attrs[onnx_op_type]): + return cls._per_op_renamed_attrs[onnx_op_type][k] + if k in cls._global_renamed_attrs: + return cls._global_renamed_attrs[k] + return k + + c2_op.arg.extend(onnx_node.attrs.caffe2(kmap=kmap)) + + if opset_version < 7: + # onnx opset 7 and newest caffe2 have adopted full onnx broadcast semantics + # so we don't need this hack anymore + if c2_op.type in cls._broadcast_operators: + already_broadcast = False + for arg in c2_op.arg: + if arg.name == 'broadcast': + already_broadcast = True + if not already_broadcast: + c2_op.arg.extend( + [caffe2.python.utils.MakeArgument('broadcast', 1)]) + + return c2_op + + @staticmethod + def _all_names_in_graph(graph): + if graph is None: + return set() + + names = set() + names.update(value_info.name for value_info in graph.input) + names.update(value_info.name for value_info in graph.output) + for node in graph.node: + names.update(node.input) + names.update(node.output) + return names + + @classmethod + def _graph_to_net(cls, onnx_graph, opset_version): + net = caffe2_pb2.NetDef() + for node in onnx_graph.node: + try: + c2ops = cls._onnx_node_to_caffe2_op(None, None, node, + opset_version) + except Exception as e: + print('ONNX FATAL:', e) + continue + net.op.extend(c2ops.init_ops) + net.op.extend(c2ops.ops) + net.external_input.extend(c2ops.interface_blobs) + net.external_output.extend(value_info.name + for value_info in onnx_graph.output) + net.external_input.extend(value_info.name + for value_info in onnx_graph.input) + return net + + @classmethod + def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, + include_initializers): + device_option = get_device_option(Device(device)) + + # init_model = cls.optimize_onnx(onnx_model, init=True) + # pred_model = cls.optimize_onnx(onnx_model, predict=True) + + init_model = onnx_model + pred_model = onnx_model + init_net = caffe2_pb2.NetDef() + pred_net = caffe2_pb2.NetDef() + + init_net.name = onnx_model.graph.name + '_init' + pred_net.name = onnx_model.graph.name + '_predict' + + if include_initializers: + init_net.op.extend( + cls._create_tensor_filling_op(tp) + for tp in onnx_model.graph.initializer) + + cls._dummy_name.reset( + cls._all_names_in_graph(init_model.graph) + | cls._all_names_in_graph(pred_model.graph)) + + success = True + for net, 
model in ((init_net, init_model), (pred_net, pred_model)): + net.device_option.CopyFrom(device_option) + for node in model.graph.node: + try: + c2ops = cls._onnx_node_to_caffe2_op(init_model, pred_model, + node, opset_version) + except Exception as e: + success = False + print('ONNX FATAL:', e) + continue + init_net.op.extend(c2ops.init_ops) + net.op.extend(c2ops.ops) + net.external_input.extend(c2ops.interface_blobs) + net.external_output.extend(value_info.name + for value_info in model.graph.output) + net.external_input.extend(value_info.name + for value_info in model.graph.input) + + if not success: + raise RuntimeError('ONNX conversion failed') + + return init_net, pred_net + + # wrapper for backwards compatability + @classmethod + def onnx_graph_to_caffe2_net(cls, + model, + device="CPU", + opset_version=_known_opset_version): + return cls._onnx_model_to_caffe2_net(model, + device=device, + opset_version=opset_version, + include_initializers=True) + + @classmethod + def supports_device(cls, device_str): + device = Device(device_str) + if device.type == DeviceType.CPU: + return True + elif core.IsGPUDeviceType(device.type): + return workspace.has_gpu_support + return False + + @classmethod + def is_compatible(cls, model, device='CPU', **kwargs): + if hasattr(super(Caffe2Backend, cls), 'is_compatible') \ + and callable(super(Caffe2Backend, cls).is_compatible): + if not super(Caffe2Backend, cls).is_compatible( + model, device, **kwargs): + return False + # TODO: should have an unspported list of operators, be optimistic for now + return True + + +prepare = Caffe2Backend.prepare + +prepare_zip_archive = Caffe2Backend.prepare_zip_archive + +run_node = Caffe2Backend.run_node + +run_model = Caffe2Backend.run_model + +supports_device = Caffe2Backend.supports_device # noqa + +is_compatible = Caffe2Backend.is_compatible diff --git a/x2paddle/decoder/onnx_decoder.py b/x2paddle/decoder/onnx_decoder.py index ad66a1430b189aaa1ba4b1e59dbfd52bbd11893a..959b5b12aeaea4c6a84aca84b79bfe712423d9b5 100644 --- a/x2paddle/decoder/onnx_decoder.py +++ b/x2paddle/decoder/onnx_decoder.py @@ -23,6 +23,7 @@ from onnx.helper import get_attribute_value, make_attribute from onnx.shape_inference import infer_shapes from onnx.mapping import TENSOR_TYPE_TO_NP_TYPE from onnx.numpy_helper import to_array +from onnx import AttributeProto, TensorProto, GraphProto from collections import OrderedDict as Dict import onnx import numpy as np @@ -44,7 +45,7 @@ class ONNXGraphNode(GraphNode): self.attr_map = self.get_attr_map() self.dtype_map = {1: "float32", 3: "int32", 9: "int64"} self.weight_inputs = list() - self.out_shapes = None + self.out_shapes = list() self.dtype = None def get_attr_map(self): @@ -58,11 +59,10 @@ class ONNXGraphNode(GraphNode): @property def value(self): - assert 'Constant' in self.layer_type, "Only Constant node has value." - - attr = self.layer.attr['value'] - if 'value' in self.attr_map: - return default + assert 'Constant' in self.layer_type, "Only Constant | ConstantOfShape node has value." 
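+        # The 'value' attribute is optional on ConstantOfShape (the ONNX
+        # spec defaults it to a float32 zero), so it may be absent here.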
+        if 'value' not in self.attr_map:
+            return None
-        return self.attr_map[name]
+        return self.attr_map['value']
 
     def get_attribute_value2(self, attr):
@@ -110,23 +110,26 @@ class ONNXGraphDataNode(GraphNode):
     def out_shapes(self):
         values = self.layer.type.tensor_type.shape.dim
         out_shapes = list()
-        out_shapes = [dim.dim_value for dim in values]
+        out_shapes.append([dim.dim_value for dim in values])
         return out_shapes
 
     @property
     def dtype(self):
         dtype = self.layer.type.tensor_type.elem_type
-
         return TENSOR_TYPE_TO_NP_TYPE[dtype]
 
 
 class ONNXGraph(Graph):
-    def __init__(self, model):
-        super(ONNXGraph, self).__init__(model)
+    def __init__(self, graph, onnx_model):
+        super(ONNXGraph, self).__init__(graph)
+        self.onnx_model = onnx_model
         self.initializer = {}
         self.place_holder_nodes = list()
         self.get_place_holder_nodes()
+        self.value_infos = self.inferred_model_value_info(graph)
+        self.results_of_inference = dict()
+
     def get_inner_nodes(self):
         """
         generate inner node of ONNX model
@@ -162,17 +165,22 @@ class ONNXGraph(Graph):
         """
         build topo_sort of ONNX model
         """
+        data_node = self.place_holder_nodes[0]
+        value_info = self.value_infos[data_node]
+        input_shape = value_info['shape']
+        self.get_results_of_inference(self.onnx_model, input_shape)
        for layer in self.model.node:
-            self.node_map[layer.name] = ONNXGraphNode(layer)
-
-        #set op node's dtype and out_shapes
-        for item in self.model.value_info:
-            if item.name in self.node_map:
-                self.node_map[item.name].dtype = TENSOR_TYPE_TO_NP_TYPE[
-                    item.type.tensor_type.elem_type]
-                self.node_map[item.name].out_shapes = [
-                    dim.dim_value for dim in item.type.tensor_type.shape.dim
-                ]
+            node = ONNXGraphNode(layer)
+            self.node_map[layer.name] = node
+            # prefer statically inferred shapes; fall back to the shapes
+            # recorded by the caffe2 inference pass for dynamic outputs
+            for opt in layer.output:
+                if opt in self.value_infos:
+                    value_info = self.value_infos[opt]
+                    node.dtype = value_info['dtype']
+                    node.out_shapes.append(value_info['shape'])
+                else:
+                    _, dtype, shape = self.get_dynamic_shape(opt)
+                    node.dtype = dtype
+                    node.out_shapes.append(shape)
 
         for layer in self.model.input:
             if layer.name not in self.node_map:
@@ -199,7 +207,6 @@
                         format(in_node, layer_name))
                 else:
                     self.connect(in_node, layer_name)
-
         #generate topo
         super(ONNXGraph, self).build()
@@ -227,31 +234,108 @@
             weight = to_array(initializer)
             yield name, weight
 
+    def inferred_model_value_info(self, graph):
+        """
+        collect value/type info for an ONNX graph
+        """
+        assert isinstance(graph,
+                          onnx.GraphProto), 'graph is not a GraphProto instance'
+
+        value_info = Dict()
+        for item in graph.value_info:
+            value_info[item.name] = {
+                'dtype':
+                TENSOR_TYPE_TO_NP_TYPE[item.type.tensor_type.elem_type],
+                'shape':
+                [dim.dim_value for dim in item.type.tensor_type.shape.dim],
+                'external': False
+            }
+        for item in graph.input:
+            assert item.name not in value_info
+            value_info[item.name] = {
+                'dtype':
+                TENSOR_TYPE_TO_NP_TYPE[item.type.tensor_type.elem_type],
+                'shape':
+                [dim.dim_value for dim in item.type.tensor_type.shape.dim],
+                'external': True
+            }
+        for item in graph.output:
+            assert item.name not in value_info
+            value_info[item.name] = {
+                'dtype':
+                TENSOR_TYPE_TO_NP_TYPE[item.type.tensor_type.elem_type],
+                'shape':
+                [dim.dim_value for dim in item.type.tensor_type.shape.dim],
+                'external': True
+            }
+        return value_info
+
+    def get_results_of_inference(self, model, shape):
+        try:
+            import torch
+            version = torch.__version__
+            if '1.1.0' not in version:
+                print("your model has a dynamic graph, torch==1.1.0 is required")
+                return
+        except:
+            print(
+                "your model has a dynamic graph; we use caffe2 to run it, please run \"pip install torch==1.1.0\"."
+            )
+            return
+        from x2paddle.decoder.onnx_backend import prepare
+
+        np_images = np.random.rand(shape[0], shape[1], shape[2],
+                                   shape[3]).astype('float32')
+
+        outputs = []
+        for node in model.graph.node:
+            value_info = onnx.helper.make_tensor_value_info(
+                node.name, TensorProto.UNDEFINED, [])
+            outputs.append(value_info)
+
+        # expose and fetch the intermediate results in chunks of 254
+        # outputs per inference run
+        while len(outputs) > 0:
+            tmp_outputs = outputs[:254]
+            model.graph.ClearField('output')
+            model.graph.output.MergeFrom(tmp_outputs)
+            prepared_backend = prepare(model,
+                                       device='CPU',
+                                       no_check_UNSAFE=True)
+            res = prepared_backend.run(inputs=np_images)
+            for idx, info in enumerate(tmp_outputs):
+                self.results_of_inference[info.name] = res[idx]
+            outputs = outputs[254:]
+        return
+
+    def get_dynamic_shape(self, layer):
+        """
+        get dynamic shape from caffe2.backend
+        """
+        output = self.results_of_inference[layer]
+        return output.tolist(), output.dtype, output.shape
+
 
 class ONNXDecoder(object):
     def __init__(self, onnx_model):
         model = onnx.load(onnx_model)
         print('model ir_version: {}, op version: {}'.format(
             model.ir_version, model.opset_import[0].version))
-
         if model.opset_import[0].version < 9:
             _logger.warning(
                 'Now, onnx2paddle main support convert onnx model opset_verison == 9,'
                 'opset_verison of your onnx model is %d < 9,'
                 'some operator may cannot convert.',
                 model.opset_import[0].version)
-        check_model(model)
-
-        model = polish_model(model)
+        check_model(model)
+        model = onnx.shape_inference.infer_shapes(model)
         model = self.optimize_model_skip_op_for_inference(model)
         model = self.optimize_model_strip_initializer(model)
 
         self.standardize_variable_name(model.graph)
         self.model = model
         graph_def = model.graph
-
-        self.onnx_graph = ONNXGraph(graph_def)
+        self.onnx_graph = ONNXGraph(graph_def, model)
         self.onnx_graph.build()
 
     def build_value_refs(self, nodes):
@@ -334,9 +418,13 @@
                                              output_name, output_refs)
             else:
                 processed = -1
-
             if processed > 0:
                 nodes_to_remove.append(node_idx)
+                # iterate over a snapshot, since value_info is mutated below
+                for value_info in list(ret.graph.value_info):
+                    for output in node.output:
+                        if value_info.name == output:
+                            ret.graph.value_info.remove(value_info)
+
                 print('skip op {}: {} -> {} -> {}'.format(
                     node_idx, input_name, node.op_type, output_name))
             elif processed == 0:
@@ -396,7 +484,6 @@
         """
         standardize variable name for paddle's code
         """
-
         for initializer in graph.initializer:
             initializer.name = self.make_variable_name(initializer.name)
         for ipt in graph.input:
@@ -455,43 +542,3 @@
             raise RuntimeError("Input mismatch {} != {}".format(
                 len(onnx_model.input), len(model.input)))
         return onnx_model
-
-    def get_dynamic_shape_from_caffe2(self, layer, input_shapes):
-        """
-        get dynamic shape from caffe2.backend
-        """
-        try:
-            import torch
-            version = torch.__version__
-            if '1.1.0' not in version:
-                print("your model have dynamic graph, torch==1.1.0 is required")
-                return
-        except:
-            print(
-                "your model have dynamic graph, we use caff2 to inference graph, please use \"pip install torch==1.1.0\"."
-            )
-            return
-        from caffe2.python.onnx.backend import prepare
-        shape = input_shapes[0]
-        np_images = np.random.rand(shape[0], shape[1], shape[2],
-                                   shape[3]).astype('float32')
-        num_onnx = self.split_model(self.model, layer)
-        prepared_backend = prepare(num_onnx, device='CPU')
-        output = prepared_backend.run(inputs=np_images)
-        return output[0].tolist()
-
-    def get_dynamic_shape_from_onnx(self, layer, input_shapes):
-        """
-        get dynamic shape from onnxruntime
-        """
-        import onnxruntime as rt
-        from onnxruntime.backend import prepare
-        import numpy as np
-        num_onnx = self.split_model(self.model, layer)
-        sess = prepare(num_onnx)
-        shape = input_shapes[0]
-        print(shape)
-        np_images = np.random.rand(shape[0], shape[1], shape[2],
-                                   shape[3]).astype('float32')
-        output = sess.run(model=sess, inputs=np_images)
-        return output[0].tolist()
diff --git a/x2paddle/op_mapper/onnx_custom_layer/InstanceNormalization.py b/x2paddle/op_mapper/onnx_custom_layer/InstanceNormalization.py
new file mode 100644
index 0000000000000000000000000000000000000000..f93f7a723f17c927a26dc12a5f20a27f112546f2
--- /dev/null
+++ b/x2paddle/op_mapper/onnx_custom_layer/InstanceNormalization.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+
+from .register import register
+
+
+def InstanceNormalization_shape(input_shape):
+    return input_shape
+
+
+def InstanceNormalization_layer(inputs, name=None):
+    # TODO(lvmengsi@baidu.com): Check the accuracy when using fluid.layers.layer_norm.
+ epsilon = 1e-5 + mean = fluid.layers.reduce_mean(inputs, dim=[2, 3], keep_dim=True) + var = fluid.layers.reduce_mean(fluid.layers.square(inputs - mean), + dim=[2, 3], + keep_dim=True) + if name is not None: + scale_name = name + "_scale" + offset_name = name + "_offset" + scale_param = fluid.ParamAttr(name=scale_name, + initializer=fluid.initializer.Constant(1.0), + trainable=True) + offset_param = fluid.ParamAttr(name=offset_name, + initializer=fluid.initializer.Constant(0.0), + trainable=True) + scale = fluid.layers.create_parameter(attr=scale_param, + shape=inputs.shape[1:2], + dtype="float32") + offset = fluid.layers.create_parameter(attr=offset_param, + shape=inputs.shape[1:2], + dtype="float32") + + tmp = fluid.layers.elementwise_mul(x=(inputs - mean), y=scale, axis=1) + tmp = tmp / fluid.layers.sqrt(var + epsilon) + tmp = fluid.layers.elementwise_add(tmp, offset, axis=1) + return tmp + + +def InstanceNormalization_weights(name, data=None): + weights_name = [name + '_scale'] + return weights_name + + +register(kind='InstanceNormalization', + shape=InstanceNormalization_shape, + layer=InstanceNormalization_layer, + weights=InstanceNormalization_weights) diff --git a/x2paddle/op_mapper/onnx_custom_layer/__init__.py b/x2paddle/op_mapper/onnx_custom_layer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c21ce95ef487405e9cd93300b5e3831fe79c6dd4 --- /dev/null +++ b/x2paddle/op_mapper/onnx_custom_layer/__init__.py @@ -0,0 +1,104 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .register import get_registered_layers +#custom layer import begins + +from . 
import InstanceNormalization +#custom layer import ends + +custom_layers = get_registered_layers() + + +def set_args(f, params): + """ set args for function 'f' using the parameters in node.layer.param + Args: + f (function): a python function object + params (object): a object contains attributes needed by f's arguments + Returns: + arg_names (list): a list of argument names + kwargs (dict): a dict contains needed arguments + """ + argc = f.__code__.co_argcount + arg_list = f.__code__.co_varnames[0:argc] + kwargs = {} + for arg_name in arg_list: + if hasattr(params, arg_name) and params is not None: + kwargs[arg_name] = getattr(params, arg_name) + return arg_list, kwargs + + +def has_layer(layer_type): + """ test whether this layer exists in custom layer + """ + return layer_type in custom_layers + + +def get_params(layer, layer_type): + import re + if layer_type.lower() == "deconvolution" or layer_type.lower( + ) == "convolutiondepthwise": + param_name = '_'.join(('convolution', 'param')) + elif layer_type.lower() == "normalize": + param_name = '_'.join(('norm', 'param')) + elif len(layer_type) - len(re.sub("[A-Z]", "", layer_type)) >= 2: + s = '' + tmp_name = '' + for i, ch in enumerate(layer_type): + if i == 0: + s += ch.lower() + continue + elif ch.isupper() and layer_type[i - 1].islower(): + tmp_name += (s + '_') + s = '' + s += ch.lower() + tmp_name += s + param_name = '_'.join((tmp_name, 'param')) + else: + param_name = '_'.join((layer_type.lower(), 'param')) + return getattr(layer, param_name, None) + + +def compute_output_shape(node): + """ compute the output shape of custom layer + """ + layer_type = node.layer_type + assert layer_type in custom_layers, "layer[%s] not exist in custom layers" % ( + layer_type) + shape_func = custom_layers[layer_type]['shape'] + layer = node.layer + params = get_params(layer, layer_type) + arg_names, kwargs = set_args(shape_func, params) + input_shape = node.input_shape + return shape_func(input_shape, **kwargs) + + +def make_custom_layer(node): + """ get the code which implement the custom layer function + """ + layer_type = node.layer_type + assert layer_type in custom_layers, "layer[%s] not exist in custom layers" % ( + layer_type) + layer_func = custom_layers[layer_type]['layer'] + import inspect + return inspect.getsource(layer_func), layer_func + + +def deal_weights(node, data=None): + """ deal the weights of the custom layer + """ + layer_type = node.layer_type + weights_func = custom_layers[layer_type]['weights'] + name = node.layer_name + return weights_func(name, data) diff --git a/x2paddle/op_mapper/onnx_custom_layer/register.py b/x2paddle/op_mapper/onnx_custom_layer/register.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3643c0497bba1a77fad8c2b819bbba08b8ba05 --- /dev/null +++ b/x2paddle/op_mapper/onnx_custom_layer/register.py @@ -0,0 +1,56 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" this module provides 'register' for registering customized layers +""" + +g_custom_layers = {} + + +def register(kind, shape, layer, weights): + """ register a custom layer or a list of custom layers + + Args: + @kind (str or list): type name of the layer + @shape (function): a function to generate the shape of layer's output + @layer (function): a function to generate the paddle code of layer + @weights (function): a function to deal with weights data + + Returns: + None + """ + assert type(shape).__name__ == 'function', 'shape should be a function' + assert type(layer).__name__ == 'function', 'layer should be a function' + + if type(kind) is str: + kind = [kind] + else: + assert type( + kind + ) is list, 'invalid param "kind" for register, not a list or str' + + for k in kind: + assert type( + k) is str, 'invalid param "kind" for register, not a list of str' + assert k not in g_custom_layers, 'this type[%s] has already been registered' % ( + k) + print('register layer[%s]' % (k)) + g_custom_layers[k] = { + 'shape': shape, + 'layer': layer, + 'weights': weights + } + + +def get_registered_layers(): + return g_custom_layers diff --git a/x2paddle/op_mapper/onnx_directly_map.py b/x2paddle/op_mapper/onnx_directly_map.py index c6edbfba4c124e49d7ff30ed9e9528d1c45639fc..bf26dbd911c3fb824f7cd7308a9a743220e3b4a0 100644 --- a/x2paddle/op_mapper/onnx_directly_map.py +++ b/x2paddle/op_mapper/onnx_directly_map.py @@ -24,6 +24,7 @@ default_op_mapping_field_values['DEFAULTS'] = dict() default_op_mapping_field_values['INPUT_PERM'] = None default_op_mapping_field_values['OUTPUT_PERM'] = None default_op_mapping_field_values['FILL_NAME_FIELD'] = True + default_op_mapping = { 'Gather': ['gather', ['X'], ['Out'], dict(axis='')], @@ -46,8 +47,44 @@ default_op_mapping = { dict(axes='dim', keepdims='keep_dim'), dict(keep_dim=1) ], + 'ReduceSum': [ + 'reduce_sum', ['X'], ['Out'], + dict(axes='dim', keepdims='keep_dim'), + dict(keep_dim=1) + ], + + #active function + 'Relu': ['relu', ['X'], ['Out']], + 'LeakyRelu': ['leaky_relu', ['X'], ['Out'], + dict(), dict(alpha=.01)], + 'Elu': ['elu', ['X'], ['Out'], + dict(), dict(alpha=1.)], + 'ThresholdedRelu': [ + 'thresholded_relu', ['X'], ['Out'], + dict(alpha='threshold'), + dict(alpha=1.) 
+ ], + 'Tanh': ['tanh', ['X'], ['Out']], + 'Sigmoid': ['sigmoid', ['X'], ['Out']], + 'Pow': ['elementwise_pow', ['X', 'Y'], ['Out'], + dict(), + dict(axis=-1)], # TODO: pow for scalar exponent + 'HardSigmoid': [ + 'hard_sigmoid', ['X'], ['Out'], + dict(alpha='slope', beta='offset'), + dict(slope=.2, offset=.5) + ], + 'Softsign': ['softsign', ['X'], ['Out']], + 'Softplus': ['softplus', ['X'], ['Out']], + 'Exp': ['exp', ['X'], ['Out']], + 'Softmax': ['softmax', ['X'], ['Out'], + dict(axis=''), + dict(axis=1)], +} + +activefunc_op_mapping = { 'LeakyRelu': ['leaky_relu', ['X'], ['Out'], - dict(), dict(alpha=.01)] + dict(), dict(alpha=.01)], } default_ioa_constraint = { diff --git a/x2paddle/op_mapper/onnx_op_mapper.py b/x2paddle/op_mapper/onnx_op_mapper.py index 4eedaa686c29c683246faada7f9f49db11301e43..f71cb1ee73080640d91de9b7d5ebb77cc4a08c32 100644 --- a/x2paddle/op_mapper/onnx_op_mapper.py +++ b/x2paddle/op_mapper/onnx_op_mapper.py @@ -14,14 +14,16 @@ from x2paddle.core.graph import GraphNode from x2paddle.core.op_mapper import OpMapper -from x2paddle.core.util import * from x2paddle.core.fluid_code import Layer from x2paddle.core.fluid_code import FluidCode from x2paddle.decoder.onnx_decoder import ONNXGraph, ONNXGraphNode, ONNXGraphDataNode from x2paddle.op_mapper.onnx_directly_map import default_op_mapping_field_values from x2paddle.op_mapper.onnx_directly_map import default_op_mapping from x2paddle.op_mapper.onnx_directly_map import default_ioa_constraint +from x2paddle.op_mapper.onnx_custom_layer import * +from x2paddle.core.util import string import numpy as np +import onnx.numpy_helper as numpy_helper import logging as _logging from collections import OrderedDict as _dict @@ -52,12 +54,12 @@ class ONNXOpMapper(OpMapper): self.input_shapes = [] self.weights = dict() self.omit_nodes = list() + self.used_custom_layers = dict() if not self.op_checker(): raise Exception("Model are not supported yet.") #mapping op - print("Total nodes: {}".format( sum([ isinstance(node, ONNXGraphNode) @@ -71,13 +73,17 @@ class ONNXOpMapper(OpMapper): func(node) elif op in default_op_mapping: self.directly_map(node) + elif op in custom_layers: + self.deal_custom_layer(node) def op_checker(self): unsupported_ops = set() for node_name in self.graph.topo_sort: node = self.graph.get_node(node_name) op = node.layer_type - if not hasattr(self, op) and op not in default_op_mapping: + if not hasattr( + self, op + ) and op not in default_op_mapping and op not in custom_layers: unsupported_ops.add(op) if len(unsupported_ops) == 0: return True @@ -133,11 +139,28 @@ class ONNXOpMapper(OpMapper): output=val_outs[0], param_attr=attr) + def deal_custom_layer(self, node): + op = node.layer_type + val_x = self.graph.get_node(node.layer.input[0], copy=True) + custom_code, func = make_custom_layer(node) + params = get_params(node.layer, node.layer_type) + arg_names, kwargs = set_args(func, params) + kwargs['name'] = string(node.layer_name) + inputs_node = [] + inputs_node.append(node.inputs[0]) + node.fluid_code.add_layer(func.__code__.co_name, + inputs=inputs_node[0], + output=node, + param_attr=kwargs, + is_custom_layer=True) + if op not in self.used_custom_layers: + self.used_custom_layers[op] = custom_code + def place_holder(self, node): - self.input_shapes.append(node.out_shapes) + self.input_shapes.append(node.out_shapes[0]) attr = { "dtype": string(node.dtype), - "shape": node.out_shapes, + "shape": node.out_shapes[0], "name": string(node.layer_name), "append_batch_size": 'False' } @@ -151,7 +174,7 @@ class 
ONNXOpMapper(OpMapper): if parameter is not None: node = parameter dtype = node.dtype - shape = node.out_shapes + shape = node.out_shapes[0] self.weights[node.layer_name] = node.weight attr = { @@ -179,13 +202,55 @@ class ONNXOpMapper(OpMapper): val_padded = self.Pad(node, op_independent=False) return [0] * ndims, val_padded + def _interpolate(self, node): + val_x = self.graph.get_node(node.layer.input[0], copy=True) + val_scales = self.graph.get_node(node.layer.input[1], copy=True) + val_y = self.graph.get_node(node.layer.output[0], copy=True) + + out_shape_ = val_y.out_shapes[0] + if out_shape_ is not None: + assert len(out_shape_) == 4, 'only 4-D Tensor as X and Y supported' + out_shape_ = out_shape_[2:] + scales = _const_weight_or_none(val_scales) + if scales is not None: + assert len(scales) == 4, 'only 4-D Tensor as X and Y supported' + assert scales[0] == 1 and scales[ + 1] == 1, 'only scale on (NC)HW supported' + assert scales[2] == scales[ + 3], 'only aspect-ratio-invariant scale supported' + scale = scales[2] if scales else None + if scale is None: + assert out_shape_, 'neither scales nor output shape is available' + out_shape = out_shape_ + else: + out_shape = None + if out_shape_ is None: + in_shape = val_x.out_shapes[0] + assert in_shape is not None, 'out_shape required but not inferrable' + assert len( + in_shape) == 4, 'only 4-D Tensor as X and Y supported' + out_shape_ = [in_shape[2] * scale, in_shape[3] * scale] + + mode = node.get_attr('mode', 'nearest') + fluid_op = 'resize_{}'.format(mode) + + attr = { + 'scale': scale, + 'out_shape': out_shape, + 'name': string(node.layer_name) + } + node.fluid_code.add_layer(fluid_op, + inputs=val_x, + output=node, + param_attr=attr) + def Pad(self, node, op_independent=True): val_x = self.graph.get_node(node.layer.input[0], copy=True) pads = node.get_attr('pads') mode = node.get_attr('mode', 'constant') value = node.get_attr('value', 0.) 
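+        # ONNX 'pads' lists every begin pad first and every end pad second
+        # (e.g. [x1_begin, x2_begin, x1_end, x2_end]); the reshape/transpose
+        # below reorders this into the per-axis (begin, end) pairs that
+        # fluid's pad ops expect.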
     def Pad(self, node, op_independent=True):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         pads = node.get_attr('pads')
         mode = node.get_attr('mode', 'constant')
         value = node.get_attr('value', 0.)
-        data_shape = val_x.out_shapes
-        output_shape = node.out_shapes
+        data_shape = val_x.out_shapes[0]
+        output_shape = node.out_shapes[0]
         assume_pad2d = False
         attr = {}
         if len(pads) == 4:
@@ -200,8 +265,6 @@ class ONNXOpMapper(OpMapper):
             attr['mode'] = string(mode)
         else:
             attr = {'pad_value': value}
-            assert mode == 'constant', 'mode {} is supported only in pad2d'.format(
-                mode)
         fluid_op = 'pad'
         if len(pads) == 4:
             paddings = np.array(pads).reshape(
@@ -209,6 +272,10 @@ class ONNXOpMapper(OpMapper):
         elif len(pads) == 8:
             paddings = np.array(pads).reshape(
                 (-1, 4)).transpose().flatten().tolist()  # SSEE -> SESE
+            if sum(paddings[:4]) == 0:
+                fluid_op = 'pad2d'
+                paddings = paddings[4:]
+                attr['mode'] = string(mode)
         attr['paddings'] = paddings
         if op_independent:
             attr['name'] = string(node.layer_name)
@@ -233,6 +300,17 @@ class ONNXOpMapper(OpMapper):
                                   output=node,
                                   param_attr=attr)
 
+    def Shrink(self, node):
+        val_x = self.graph.get_node(node.layer.input[0], copy=True)
+        bias = node.get_attr('bias')
+        lambd = node.get_attr('lambd')
+        assert bias == 0.0, 'not support bias!=0'
+        attr = {'threshold': lambd, 'name': string(node.layer_name)}
+        node.fluid_code.add_layer('hard_shrink',
+                                  inputs=val_x,
+                                  output=node,
+                                  param_attr=attr)
+
     def Constant(self, node):
         val_output = self.graph.get_node(node.layer.output[0], copy=True)
@@ -244,7 +322,7 @@ class ONNXOpMapper(OpMapper):
         shape = node.get_attr('shape', None)
 
         if shape is None:
-            shape = val_output.out_shapes
+            shape = val_output.out_shapes[0]
         if shape is None:
             shape = list(value.shape)
             _logger.warning(
@@ -253,8 +331,8 @@ class ONNXOpMapper(OpMapper):
                 'using value as 1-D tensor may lead to fails',
                 val_output.layer_name, val_output.layer_name)
 
-        value = value.tolist()
         if len(value) == 1:  # scalar
+            value = value.tolist()
             shape = [1]
             value = value[0]
             if dtype.name == 'int64':
@@ -264,14 +342,27 @@ class ONNXOpMapper(OpMapper):
                                   inputs=None,
                                   output=node,
                                   param_attr=attr)
+        else:
+            value = np.reshape(value, shape)
+            self.weights[node.layer_name] = value
+            attr = {
+                'dtype': string(dtype),
+                'shape': shape,
+                'name': string(node.layer_name),
+                'attr': string(node.layer_name),
+                'default_initializer': 'Constant(0.0)'
+            }
+            node.fluid_code.add_layer("create_parameter",
+                                      inputs=None,
+                                      output=node,
+                                      param_attr=attr)
 
     def Resize(self, node):
-        # I/O
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         val_scales = self.graph.get_node(node.layer.input[1], copy=True)
-        val_y, = self.graph.get_node(node.layer.output[0], copy=True)
+        val_y = self.graph.get_node(node.layer.output[0], copy=True)
 
-        out_shape_ = val_y.out_shapes
+        out_shape_ = val_y.out_shapes[0]
         if out_shape_ is not None:
             assert len(out_shape_) == 4, 'only 4-D Tensor as X and Y supported'
             out_shape_ = out_shape_[2:]
@@ -289,7 +380,7 @@ class ONNXOpMapper(OpMapper):
         else:
             out_shape = None
             if out_shape_ is None:
-                in_shape = val_x.out_shapes
+                in_shape = val_x.out_shapes[0]
                 assert in_shape is not None, 'out_shape required but not inferrable'
                 assert len(
                     in_shape) == 4, 'only 4-D Tensor as X and Y supported'
@@ -297,8 +388,6 @@ class ONNXOpMapper(OpMapper):
         mode = node.get_attr('mode', 'nearest')
         fluid_op = 'resize_{}'.format(mode)
 
-        name_attr = ', name={}'.format(repr(name)) if name else ''
-
         attr = {
             'scale': scale,
             'out_shape': out_shape,
@@ -309,13 +398,40 @@ class ONNXOpMapper(OpMapper):
                                   output=node,
                                   param_attr=attr)
 
+    def Upsample(self, node):
+        self._interpolate(node)
+
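The reshape/transpose/flatten chain in Pad above converts ONNX's pad layout, all begin values followed by all end values (SSEE), into fluid's per-axis begin/end pairs (SESE). With hypothetical NCHW pads:

    import numpy as np

    pads = [0, 0, 1, 2, 0, 0, 3, 4]  # N,C,H,W begins then N,C,H,W ends
    sese = np.array(pads).reshape((-1, 4)).transpose().flatten().tolist()
    print(sese)  # [0, 0, 0, 0, 1, 3, 2, 4]
    # A leading block of four zeros means no N/C padding; that is exactly
    # the case the mapper now lowers to pad2d with sese[4:].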
+    def Slice(self, node):
+        val_x = self.graph.get_node(node.layer.input[0], copy=True)
+        val_y = self.graph.get_node(node.layer.output[0], copy=True)
+
+        axes = node.get_attr('axes')
+        starts = node.get_attr('starts')
+        ends = node.get_attr('ends')
+        shape = val_x.out_shapes[0]
+
+        if shape is not None:
+            for idx, value in enumerate(starts):
+                if value > (2**63 - 1) // 2:  # INT64-max-style sentinel
+                    value = value - ONNX_INT_MAX
+                    starts[idx] = shape[axes[idx]] + value
+            for idx, value in enumerate(ends):
+                if value > (2**63 - 1) // 2:  # INT64-max-style sentinel
+                    value = value - ONNX_INT_MAX
+                    ends[idx] = shape[axes[idx]] + value
+        attr = {"axes": axes, "starts": starts, "ends": ends}
+        node.fluid_code.add_layer('slice',
+                                  inputs=val_x,
+                                  output=node,
+                                  param_attr=attr)
+
     def ConstantOfShape(self, node):
         val_shape = self.graph.get_node(node.layer.input[0], copy=True)
-
+        val_y = self.graph.get_node(node.layer.output[0], copy=True)
         shape = _const_weight_or_none(val_shape)
 
         if shape is None:
-            shape = node.out_shapes
+            shape = node.out_shapes[0]
 
         assert shape is not None, (
             'given shape is neither const value nor deductible from output, '
@@ -359,10 +475,10 @@ class ONNXOpMapper(OpMapper):
 
         # catch dynamic graph shape
         if isinstance(val_shape, ONNXGraphNode):
-            shape = self.decoder.get_dynamic_shape_from_caffe2(
-                val_shape.layer_name, self.input_shapes)
+            shape, _, _ = self.decoder.onnx_graph.get_dynamic_shape(
+                val_shape.layer_name)
         if shape is None:
-            shape = val_reshaped.out_shapes
+            shape = val_reshaped.out_shapes[0]
 
         shape_dtype = val_shape.dtype
@@ -415,9 +531,10 @@ class ONNXOpMapper(OpMapper):
         pads = node.get_attr('pads', [0] * (poolnd * 2))
         fluid_op = 'pool{}d'.format(poolnd)
         assert 2 <= poolnd <= 3, 'only pool2d and pool3d is supported'
+
+        input_shape = val_x.out_shapes[0]
         paddings, val_x = self._pad_if_asymmetric(node, pads, val_x)
 
-        input_shape = val_x.out_shapes
         if auto_pad == "SAME_UPPER" or auto_pad == "SAME_LOWER":
             pad_h = get_same_padding(input_shape[2], kernel_shape[0],
                                      strides[0])
@@ -572,14 +689,6 @@ class ONNXOpMapper(OpMapper):
                                   output=node,
                                   param_attr=attr)
 
-    def Softmax(self, node):
-        val_x = self.graph.get_node(node.layer.input[0], copy=True)
-        attr = {"name": string(node.layer_name)}
-        node.fluid_code.add_layer("softmax",
-                                  inputs=val_x,
-                                  output=node,
-                                  param_attr=attr)
-
     def Transpose(self, node):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         perm = node.get_attr('perm')
@@ -589,15 +698,79 @@ class ONNXOpMapper(OpMapper):
                                   output=node,
                                   param_attr=attr)
 
+    def Mul(self, node):
+        val_x = self.graph.get_node(node.layer.input[0], copy=True)
+        val_y = self.graph.get_node(node.layer.input[1], copy=True)
+
+        val_x_shape = val_x.out_shapes[0]
+        val_y_shape = val_y.out_shapes[0]
+
+        slice_idx = 0
+        for dim in val_y_shape:
+            if dim == 1:
+                slice_idx += 1
+            else:
+                break
+        attr = {"name": string(node.layer_name)}
+        if slice_idx < len(val_y_shape) and slice_idx > 0:
+            val_y_reshaped = val_y_shape[slice_idx:]
+            var_y_reshaped = val_y.layer_name + '_reshaped'
+            attr_reshaped = {
+                'shape': val_y_reshaped,
+                'name': string(var_y_reshaped)
+            }
+            node.fluid_code.add_layer('reshape',
+                                      inputs=val_y,
+                                      output=var_y_reshaped,
+                                      param_attr=attr_reshaped)
+            inputs = {'x': val_x, 'y': var_y_reshaped}
+            node.fluid_code.add_layer("elementwise_mul",
+                                      inputs=inputs,
+                                      output=node,
+                                      param_attr=attr)
+        else:
+            inputs = {'x': val_x, 'y': val_y}
+            node.fluid_code.add_layer("elementwise_mul",
+                                      inputs=inputs,
+                                      output=node,
+                                      param_attr=attr)
+
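The leading-singleton logic in Mul (mirrored by Div below) can be read in isolation; shapes here are hypothetical:

    x_shape = [2, 3, 4, 5]  # lhs
    y_shape = [1, 1, 4, 5]  # rhs with ONNX-style leading broadcast axes

    slice_idx = 0
    for dim in y_shape:     # count leading 1s
        if dim == 1:
            slice_idx += 1
        else:
            break

    print(y_shape[slice_idx:])  # [4, 5]
    # Reshaped to [4, 5], y lines up with the trailing axes of x, so the
    # generated elementwise_mul/elementwise_div broadcasts correctly.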
     def Div(self, node):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         val_y = self.graph.get_node(node.layer.input[1], copy=True)
-        inputs = {'x': val_x, 'y': val_y}
+
+        val_x_shape = val_x.out_shapes[0]
+        val_y_shape = val_y.out_shapes[0]
+
+        slice_idx = 0
+        for dim in val_y_shape:
+            if dim == 1:
+                slice_idx += 1
+            else:
+                break
         attr = {"name": string(node.layer_name)}
-        node.fluid_code.add_layer("elementwise_div",
-                                  inputs=inputs,
-                                  output=node,
-                                  param_attr=attr)
+        if slice_idx < len(val_y_shape) and slice_idx > 0:
+            val_y_reshaped = val_y_shape[slice_idx:]
+            var_y_reshaped = val_y.layer_name + '_reshaped'
+            attr_reshaped = {
+                'shape': val_y_reshaped,
+                'name': string(var_y_reshaped)
+            }
+            node.fluid_code.add_layer('reshape',
+                                      inputs=val_y,
+                                      output=var_y_reshaped,
+                                      param_attr=attr_reshaped)
+            inputs = {'x': val_x, 'y': var_y_reshaped}
+            node.fluid_code.add_layer("elementwise_div",
+                                      inputs=inputs,
+                                      output=node,
+                                      param_attr=attr)
+        else:
+            inputs = {'x': val_x, 'y': val_y}
+            node.fluid_code.add_layer("elementwise_div",
+                                      inputs=inputs,
+                                      output=node,
+                                      param_attr=attr)
 
     def Relu(self, node):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
@@ -610,12 +783,17 @@ class ONNXOpMapper(OpMapper):
     def PRelu(self, node):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         val_slope = self.graph.get_node(node.layer.input[1], copy=True)
-        attr = {"name": string(node.layer_name), "mode": string('channel')}
-        if isinstance(val_slope, str):
-            attr["param_attr"] = string(val_slope.layer_name)
-        else:
-            attr["param_attr"] = string(val_slope.layer_name)
+        mode = 'channel'
+        shape_slope = val_slope.out_shapes[0]
+        if len(shape_slope) == 1:
+            mode = 'all'
+        elif len(shape_slope) > 2:
+            mode = 'element'
+        attr = {
+            "param_attr": string(val_slope.layer_name),
+            'mode': string(mode)
+        }
         node.fluid_code.add_layer("prelu",
                                   inputs=val_x,
                                   output=node,
@@ -649,9 +827,10 @@ class ONNXOpMapper(OpMapper):
         pads = node.get_attr('pads', [0] * (poolnd * 2))  # optional
         fluid_op = 'pool{}d'.format(poolnd)
         assert 2 <= poolnd <= 3, 'only pool2d and pool3d is supported'
+
+        input_shape = val_x.out_shapes[0]
         paddings, val_x = self._pad_if_asymmetric(node, pads, val_x)
 
-        input_shape = val_x.out_shapes
         if auto_pad == "SAME_UPPER" or auto_pad == "SAME_LOWER":
             pad_h = get_same_padding(input_shape[2], kernel_shape[0],
                                      strides[0])
@@ -676,8 +855,8 @@
     def GlobalAveragePool(self, node):
         val_x = self.graph.get_node(node.layer.input[0], copy=True)
         val_y = self.graph.get_node(node.layer.output[0], copy=True)
-        input_shape = val_x.out_shapes
-        output_shape = val_y.out_shapes
+        input_shape = val_x.out_shapes[0]
+        output_shape = val_y.out_shapes[0]
         assert input_shape is not None or output_shape is not None, 'poolnd not inferred'  # N
         if input_shape:
             poolnd = len(input_shape) - 2  # NC...
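get_same_padding, which the pooling and conv mappers above rely on, is defined elsewhere in x2paddle and is not part of this patch. The TensorFlow-style formula it is expected to implement looks roughly like this sketch (same_padding is illustrative, not the actual helper):

    def same_padding(in_size, kernel, stride):
        # ceil(in_size / stride) output positions; pad the shortfall,
        # giving the extra pixel to the end (SAME_UPPER convention).
        out_size = (in_size + stride - 1) // stride
        total = max((out_size - 1) * stride + kernel - in_size, 0)
        return [total // 2, total - total // 2]  # [pad_begin, pad_end]

    print(same_padding(224, 7, 2))  # [2, 3]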
@@ -701,7 +880,6 @@ class ONNXOpMapper(OpMapper):
         val_y = self.graph.get_node(node.layer.output[0], copy=True)
         self.omit_nodes.append(val_w.layer_name)
 
-        input_shape = val_x.out_shapes
         has_bias = len(node.layer.input) == 3
         if has_bias:
@@ -709,12 +887,12 @@ class ONNXOpMapper(OpMapper):
             self.omit_nodes.append(val_b.layer_name)
         auto_pad = node.get_attr('auto_pad', 'NOTSET')
 
-        kernel_shape = val_w.out_shapes[2:]  # OI...
+        kernel_shape = val_w.out_shapes[0][2:]  # OI...
         assert kernel_shape == node.get_attr(
             'kernel_shape'), 'kernel_shape in attr unmatches value_info'  # HW
         convnd = len(kernel_shape)
         assert 2 <= convnd <= 3, 'only conv2d and conv3d is supported'
-        num_out_channels = val_w.out_shapes[0]  # OI...
+        num_out_channels = val_w.out_shapes[0][0]  # OI...
         fluid_op = 'conv{}d'.format(convnd)
 
         num_groups = node.get_attr('group', 1)
@@ -722,6 +900,7 @@ class ONNXOpMapper(OpMapper):
         dilations = node.get_attr('dilations', [1] * convnd)  # optional
         pads = node.get_attr('pads', [0] * (convnd * 2))  # optional
 
+        input_shape = val_x.out_shapes[0]
         paddings, val_x = self._pad_if_asymmetric(node, pads, val_x)
 
         if auto_pad == "SAME_UPPER" or auto_pad == "SAME_LOWER":
@@ -749,3 +928,55 @@ class ONNXOpMapper(OpMapper):
                                   inputs=val_x,
                                   output=node,
                                   param_attr=attr)
+
+    def ConvTranspose(self, node):
+        val_x = self.graph.get_node(node.layer.input[0], copy=True)
+        val_w = self.graph.get_node(node.layer.input[1], copy=True)
+        val_b = self.graph.get_node(node.layer.input[2], copy=True)
+
+        self.omit_nodes.append(val_w.layer_name)
+        self.omit_nodes.append(val_b.layer_name)
+
+        val_y = self.graph.get_node(node.layer.output[0], copy=True)
+
+        auto_pad = node.get_attr('auto_pad', 'NOTSET')
+        out_padding = node.get_attr('output_padding', [0, 0])
+        kernel_shape = node.get_attr('kernel_shape', val_w.out_shapes[0][2:])
+        assert kernel_shape, 'kernel_shape not inferred'
+        convnd = len(kernel_shape)
+        assert 2 <= convnd <= 3, 'only conv2d_transpose and conv3d_transpose supported'
+        num_out_channels = val_w.out_shapes[0][1]
+        fluid_op = 'conv{}d_transpose'.format(convnd)
+
+        num_groups = node.get_attr('group', 1)
+        strides = node.get_attr('strides', [1] * convnd)
+        dilations = node.get_attr('dilations', [1] * convnd)
+        output_size = node.get_attr('output_shape', [])
+        pads = node.get_attr('pads', [0] * (convnd * 2))
+
+        paddings, val_x = self._pad_if_asymmetric(node, pads, val_x)
+
+        output_size = [0, 0]
+        output_size[0] = (val_x.out_shapes[0][2] -
+                          1) * strides[0] - 2 * paddings[0] + dilations[0] * (
+                              kernel_shape[0] - 1) + 1 + out_padding[0]
+        output_size[1] = (val_x.out_shapes[0][3] -
+                          1) * strides[1] - 2 * paddings[1] + dilations[1] * (
+                              kernel_shape[1] - 1) + 1 + out_padding[1]
+        attr = {
+            'num_filters': num_out_channels,
+            'output_size': output_size or None,
+            'filter_size': kernel_shape,
+            'padding': paddings,
+            'stride': strides,
+            'dilation': dilations,
+            'groups': num_groups,
+            'param_attr': string(val_w.layer_name),
+            'bias_attr': string(val_b.layer_name),
+            'name': string(node.layer_name),
+        }
+        node.fluid_code.add_layer(fluid_op,
+                                  inputs=val_x,
+                                  output=node,
+                                  param_attr=attr)
diff --git a/x2paddle/optimizer/onnx_optimizer.py b/x2paddle/optimizer/onnx_optimizer.py
index 28ffd0fdca60b353eb2881418f5d5cd1c507b5da..a8f851b6c5ea6140c53b91b5d20a6bbf3aa3046f 100644
--- a/x2paddle/optimizer/onnx_optimizer.py
+++ b/x2paddle/optimizer/onnx_optimizer.py
@@ -14,7 +14,6 @@
 # TODO useless node remove
 from x2paddle.op_mapper.onnx_op_mapper import ONNXOpMapper
-from x2paddle.core.util import *
 
 
 class ONNXOptimizer(object):
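The output-size arithmetic in ConvTranspose is the standard transposed-convolution relation, out = (in - 1) * stride - 2 * pad + dilation * (kernel - 1) + 1 + output_padding. A quick numeric check with invented parameters:

    def deconv_out(in_size, stride, padding, dilation, kernel, out_padding):
        return ((in_size - 1) * stride - 2 * padding +
                dilation * (kernel - 1) + 1 + out_padding)

    print(deconv_out(16, 2, 1, 1, 4, 0))  # 32: doubles a 16x16 feature map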