diff --git a/fluid/DeepASR/tools/profile.py b/fluid/DeepASR/tools/profile.py
index cb0227c33a25b1c38977f8485237f13d0351c36f..9d0b47694273345357726ad51062d01ff01b120d 100644
--- a/fluid/DeepASR/tools/profile.py
+++ b/fluid/DeepASR/tools/profile.py
@@ -169,7 +169,8 @@ def profile(args):
             outs = exe.run(fluid.default_main_program(),
                            feed={"feature": feature_t,
                                  "label": label_t},
-                           fetch_list=[avg_cost, accuracy],
+                           fetch_list=[avg_cost, accuracy]
+                           if args.print_train_acc else [],
                            return_numpy=False)
 
             if args.print_train_acc:
diff --git a/fluid/DeepASR/train.py b/fluid/DeepASR/train.py
index 9856dad7d56b47bf14c32a7d0ca0ec10b8ecf88f..b5d2239e94dd5ddcd79d0245e4b980d6cf5bfbf4 100644
--- a/fluid/DeepASR/train.py
+++ b/fluid/DeepASR/train.py
@@ -216,16 +216,17 @@ def train(args):
             label_t.set(labels, place)
             label_t.set_lod([lod])
 
-            cost, acc = exe.run(fluid.default_main_program(),
-                                feed={"feature": feature_t,
-                                      "label": label_t},
-                                fetch_list=[avg_cost, accuracy],
-                                return_numpy=False)
+            to_print = batch_id > 0 and (batch_id % args.print_per_batches == 0)
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"feature": feature_t,
+                                 "label": label_t},
+                           fetch_list=[avg_cost, accuracy] if to_print else [],
+                           return_numpy=False)
 
-            if batch_id > 0 and (batch_id % args.print_per_batches == 0):
+            if to_print:
                 print("\nBatch %d, train cost: %f, train acc: %f" %
-                      (batch_id, lodtensor_to_ndarray(cost)[0],
-                       lodtensor_to_ndarray(acc)[0]))
+                      (batch_id, lodtensor_to_ndarray(outs[0])[0],
+                       lodtensor_to_ndarray(outs[1])[0]))
                 # save the latest checkpoint
                 if args.checkpoints != '':
                     model_path = os.path.join(args.checkpoints,
diff --git a/fluid/image_classification/caffe2fluid/README.md b/fluid/image_classification/caffe2fluid/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..279b4c6e57a785736a1c75928de8d45f4e4e956e
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/README.md
@@ -0,0 +1,25 @@
+### Caffe2Fluid
+This tool is used to convert a Caffe model to Fluid model
+
+### Howto
+1, Prepare caffepb.py in ./proto, two options provided
+    1) generate it from caffe.proto using protoc
+        bash ./proto/compile.sh
+
+    2) download one from github directly
+        cd proto/ && wget https://github.com/ethereon/caffe-tensorflow/blob/master/kaffe/caffe/caffepb.py
+
+2, Convert the caffe model using 'convert.py' which will generate a python script and a weight(in .npy) file
+
+3, Use the converted model to predict
+    see more detail info in 'tests/lenet/README.md'
+
+
+### Supported models
+- Lenet on mnist dataset
+
+- ResNets:(ResNet-50, ResNet-101, ResNet-152)
+    model addrs:(https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)
+
+### Notes
+Some of this code come from here: https://github.com/ethereon/caffe-tensorflow
diff --git a/fluid/image_classification/caffe2fluid/convert.py b/fluid/image_classification/caffe2fluid/convert.py
new file mode 100755
index 0000000000000000000000000000000000000000..68a9e4f7e490a69c1b582d6fc14b2015bfdf9536
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/convert.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import numpy as np
+import argparse
+from kaffe import KaffeError, print_stderr
+
+from kaffe.paddle import Transformer
+
+
+def fatal_error(msg):
+    """Report a fatal error on stderr and terminate the process.
+
+    Args:
+        msg: Text handed to ``print_stderr`` before exiting.
+
+    Raises:
+        SystemExit: Always, with exit status -1.
+    """
+    print_stderr(msg)
+    # Use sys.exit rather than the ``exit`` helper: the latter is injected by
+    # the ``site`` module for interactive use and is missing under ``python -S``.
+    sys.exit(-1)
+
+
+def validate_arguments(args):
+    """Check that the input/output path options form a usable combination.
+
+    Each failed check is reported through ``fatal_error``.
+    """
+    has_weights_in = args.caffemodel is not None
+    has_weights_out = args.data_output_path is not None
+    has_code_out = args.code_output_path is not None
+
+    # The weights input and the weights output must be given together.
+    if has_weights_out and not has_weights_in:
+        fatal_error('No input data path provided.')
+    if has_weights_in and not has_weights_out:
+        fatal_error('No output data path provided.')
+    # At least one kind of output has to be requested.
+    if not (has_code_out or has_weights_out):
+        fatal_error('No output path specified.')
+
+
+def convert(def_path, caffemodel_path, data_output_path, code_output_path,
+            phase):
+    """Run the transformer and save the converted weights and/or source.
+
+    Weights are only converted and saved when ``caffemodel_path`` is given;
+    source is only generated when ``code_output_path`` is truthy. Any
+    ``KaffeError`` raised along the way is reported through ``fatal_error``.
+    """
+    try:
+        transformer = Transformer(def_path, caffemodel_path, phase=phase)
+        print_stderr('Converting data...')
+        if caffemodel_path is not None:
+            weights = transformer.transform_data()
+            print_stderr('Saving data...')
+            with open(data_output_path, 'wb') as weights_file:
+                np.save(weights_file, weights)
+        if code_output_path:
+            print_stderr('Saving source...')
+            with open(code_output_path, 'wb') as source_file:
+                generated = transformer.transform_source()
+                source_file.write(generated)
+        print_stderr('Done.')
+    except KaffeError as err:
+        fatal_error('Error encountered: {}'.format(err))
+
+
+def main():
+    """Parse the command line, validate it and run the conversion."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument('def_path', help='Model definition (.prototxt) path')
+    cli.add_argument('--caffemodel', help='Model data (.caffemodel) path')
+    cli.add_argument('--data-output-path', help='Converted data output path')
+    cli.add_argument(
+        '--code-output-path', help='Save generated source to this path')
+    cli.add_argument(
+        '-p',
+        '--phase',
+        default='test',
+        help='The phase to convert: test (default) or train')
+    options = cli.parse_args()
+    validate_arguments(options)
+    convert(
+        def_path=options.def_path,
+        caffemodel_path=options.caffemodel,
+        data_output_path=options.data_output_path,
+        code_output_path=options.code_output_path,
+        phase=options.phase)
+
+
+# Run the converter only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
diff --git a/fluid/image_classification/caffe2fluid/kaffe/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c11ce45c63112a75a2d15ac5d46fbbbf9f6a76e9
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/__init__.py
@@ -0,0 +1,5 @@
+from .graph import GraphBuilder, NodeMapper
+from .errors import KaffeError, print_stderr
+
+import os
+from . import paddle
diff --git a/fluid/image_classification/caffe2fluid/kaffe/caffe/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/caffe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d53dee29d79721cfef275fcc0592fa8310acd34
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/caffe/__init__.py
@@ -0,0 +1 @@
+from .resolver import get_caffe_resolver, has_pycaffe
diff --git a/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbd48d3ade5ab4b812210acf82be625871740cb
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/caffe/resolver.py
@@ -0,0 +1,61 @@
+import os
+import sys
+
+# Module-wide CaffeResolver instance, created on first use by
+# get_caffe_resolver().
+SHARED_CAFFE_RESOLVER = None
+
+
+def import_caffepb():
+    """Import and return the standalone ``caffepb`` protobuf module.
+
+    The ``../../proto`` directory relative to this file is put at the front
+    of ``sys.path`` before the import is attempted.
+    """
+    here = os.path.dirname(os.path.realpath(__file__))
+    proto_dir = os.path.join(here, '../../proto')
+    sys.path.insert(0, proto_dir)
+    import caffepb
+    return caffepb
+
+
+class CaffeResolver(object):
+    """Locates the Caffe protobuf definitions, preferring PyCaffe."""
+
+    def __init__(self):
+        self.import_caffe()
+
+    def import_caffe(self):
+        """Bind ``caffe``, ``caffepb`` and ``NetParameter`` on this instance."""
+        self.caffe = None
+        try:
+            # PyCaffe is the preferred backend.
+            import caffe
+        except ImportError:
+            # No PyCaffe: use the standalone protobuf module instead.
+            self.caffepb = import_caffepb()
+            show_fallback_warning()
+        else:
+            self.caffe = caffe
+            # Take the protobuf code from the imported distribution so that
+            # Caffe variants with custom layers keep working.
+            self.caffepb = caffe.proto.caffe_pb2
+        self.NetParameter = self.caffepb.NetParameter
+
+    def has_pycaffe(self):
+        """Return True when the PyCaffe import succeeded."""
+        return self.caffe is not None
+
+
+def get_caffe_resolver():
+    """Return the shared ``CaffeResolver``, creating it on the first call."""
+    global SHARED_CAFFE_RESOLVER
+    if SHARED_CAFFE_RESOLVER is not None:
+        return SHARED_CAFFE_RESOLVER
+    SHARED_CAFFE_RESOLVER = CaffeResolver()
+    return SHARED_CAFFE_RESOLVER
+
+
+def has_pycaffe():
+    """Tell whether the shared resolver found a PyCaffe installation."""
+    resolver = get_caffe_resolver()
+    return resolver.has_pycaffe()
+
+
+def show_fallback_warning():
+    """Write the "PyCaffe not found" warning banner to stderr."""
+    msg = '''
+------------------------------------------------------------
+    WARNING: PyCaffe not found!
+    Falling back to a pure protocol buffer implementation.
+    * Conversions will be drastically slower.
+    * This backend is UNTESTED!
+------------------------------------------------------------
+
+'''
+    sys.stderr.write(msg)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/errors.py b/fluid/image_classification/caffe2fluid/kaffe/errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..75eced5778a1f9abc8d6700ef5342b02462f6db3
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/errors.py
@@ -0,0 +1,34 @@
+import sys
+
+# Current log level; either 'warn' or 'verbose'.
+# debug() only prints at 'verbose'. Change it through set_loglevel().
+log_level = 'warn'
+
+
+class KaffeError(Exception):
+    """Base exception raised by the kaffe conversion code."""
+    pass
+
+
+def print_stderr(msg):
+    """Write ``msg`` followed by a newline to standard error."""
+    line = '%s\n' % msg
+    sys.stderr.write(line)
+
+
+def debug(msg):
+    """Emit ``msg`` with a [DEBUG] tag, but only at the 'verbose' level."""
+    if log_level != 'verbose':
+        return
+    print_stderr('[DEBUG]' + msg)
+
+
+def notice(msg):
+    """Emit ``msg`` on stderr with a [NOTICE] tag, at every log level."""
+    tagged = '[NOTICE]' + msg
+    print_stderr(tagged)
+
+
+def warn(msg):
+    """Emit ``msg`` on stderr with a [WARNING] tag, at every log level."""
+    tagged = '[WARNING]' + msg
+    print_stderr(tagged)
+
+
+def set_loglevel(level):
+    """Set the module-wide log level.
+
+    Args:
+        level: Either 'warn' or 'verbose'.
+
+    Raises:
+        ValueError: If ``level`` is not one of the supported names.
+    """
+    global log_level
+
+    if level not in ('warn', 'verbose'):
+        # ValueError subclasses Exception, so callers that catch the generic
+        # Exception raised previously keep working.
+        raise ValueError('not supported log level[%s]' % (level))
+
+    log_level = level
diff --git a/fluid/image_classification/caffe2fluid/kaffe/graph.py b/fluid/image_classification/caffe2fluid/kaffe/graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb751dffa1ca9cc19214bed12681312942046df6
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -0,0 +1,302 @@
+from google.protobuf import text_format
+
+from .caffe import get_caffe_resolver
+from .errors import KaffeError, print_stderr
+from .layers import LayerAdapter, LayerType, NodeKind, NodeDispatch
+from .shapes import TensorShape
+
+
+class Node(object):
+    """A single vertex of the model graph, optionally wrapping a layer."""
+
+    def __init__(self, name, kind, layer=None):
+        self.name = name
+        self.kind = kind
+        # Nodes created without a layer carry no adapter.
+        if layer:
+            self.layer = LayerAdapter(layer, kind)
+        else:
+            self.layer = None
+        self.parents = []
+        self.children = []
+        self.data = None
+        self.output_shape = None
+        self.metadata = {}
+
+    def add_parent(self, parent_node):
+        """Link ``parent_node`` as a parent and mirror the edge on its side."""
+        assert parent_node not in self.parents
+        self.parents.append(parent_node)
+        siblings = parent_node.children
+        if self not in siblings:
+            siblings.append(self)
+
+    def add_child(self, child_node):
+        """Link ``child_node`` as a child and mirror the edge on its side."""
+        assert child_node not in self.children
+        self.children.append(child_node)
+        co_parents = child_node.parents
+        if self not in co_parents:
+            co_parents.append(self)
+
+    def get_only_parent(self):
+        """Return the sole parent; raise KaffeError unless there is exactly one."""
+        parent_count = len(self.parents)
+        if parent_count == 1:
+            return self.parents[0]
+        raise KaffeError('Node (%s) expected to have 1 parent. Found %s.' %
+                         (self, parent_count))
+
+    @property
+    def parameters(self):
+        """The wrapped layer's parameters, or None when there is no layer."""
+        return self.layer.parameters if self.layer is not None else None
+
+    def __str__(self):
+        return '[%s] %s' % (self.kind, self.name)
+
+    def __repr__(self):
+        return '%s (0x%x)' % (self.name, id(self))
+
+
+class Graph(object):
+    """An ordered collection of nodes with lookup by node name."""
+
+    def __init__(self, nodes=None, name=None):
+        self.nodes = nodes or []
+        self.node_lut = {node.name: node for node in self.nodes}
+        self.name = name
+
+    def add_node(self, node):
+        """Append ``node`` and register it in the name lookup table."""
+        self.nodes.append(node)
+        self.node_lut[node.name] = node
+
+    def get_node(self, name):
+        """Return the node called ``name``; raise KaffeError if it is unknown."""
+        try:
+            return self.node_lut[name]
+        except KeyError:
+            raise KaffeError('Layer not found: %s' % name)
+
+    def get_input_nodes(self):
+        """Return the nodes that have no parents."""
+        return [node for node in self.nodes if len(node.parents) == 0]
+
+    def get_output_nodes(self):
+        """Return the nodes that have no children."""
+        return [node for node in self.nodes if len(node.children) == 0]
+
+    def topologically_sorted(self):
+        """Return the nodes ordered so that every node precedes its children.
+
+        Raises:
+            KaffeError: If a cycle is found while walking the children.
+        """
+        finished = []
+        unsorted_nodes = list(self.nodes)
+        temp_marked = set()
+        perm_marked = set()
+
+        def visit(node):
+            if node in temp_marked:
+                raise KaffeError('Graph is not a DAG.')
+            if node in perm_marked:
+                return
+            temp_marked.add(node)
+            for child in node.children:
+                visit(child)
+            perm_marked.add(node)
+            temp_marked.remove(node)
+            # Record nodes in finishing order; one reverse at the end yields
+            # the same result as inserting each node at the front, without
+            # the quadratic cost of repeated front insertion.
+            finished.append(node)
+
+        while len(unsorted_nodes):
+            visit(unsorted_nodes.pop())
+        finished.reverse()
+        return finished
+
+    def compute_output_shapes(self):
+        """Set ``output_shape`` on every node, visiting parents first."""
+        sorted_nodes = self.topologically_sorted()
+        for node in sorted_nodes:
+            node.output_shape = TensorShape(
+                *NodeKind.compute_output_shape(node))
+
+    def replaced(self, new_nodes):
+        """Return a new Graph with the same name built from ``new_nodes``."""
+        return Graph(nodes=new_nodes, name=self.name)
+
+    def transformed(self, transformers):
+        """Apply each transformer in turn and return the resulting graph.
+
+        Raises:
+            KaffeError: If a transformer returns None.
+        """
+        graph = self
+        for transformer in transformers:
+            graph = transformer(graph)
+            if graph is None:
+                raise KaffeError('Transformer failed: {}'.format(transformer))
+            assert isinstance(graph, Graph)
+        return graph
+
+    def __contains__(self, key):
+        return key in self.node_lut
+
+    def __str__(self):
+        """Render a table with one row per node in topological order."""
+        row_fmt = '{:<20} {:<30} {:>20} {:>20}'
+        hdr = row_fmt.format('Type', 'Name', 'Param', 'Output')
+        s = [hdr, '-' * 94]
+        for node in self.topologically_sorted():
+            # If the node has learned parameters, display the first one's shape.
+            # In case of convolutions, this corresponds to the weights.
+            data_shape = node.data[0].shape if node.data else '--'
+            # Keep the '--' placeholder intact instead of splitting it into a
+            # tuple of characters when no output shape is available.
+            out_shape = tuple(node.output_shape) if node.output_shape else '--'
+            # Shapes are stringified first: tuples reject an alignment format
+            # spec on Python 3.
+            s.append(
+                row_fmt.format(node.kind, node.name,
+                               str(data_shape), str(out_shape)))
+        return '\n'.join(s)
+
+
+class GraphBuilder(object):
+    '''Constructs a model graph from a Caffe protocol buffer definition.'''
+
+    def __init__(self, def_path, phase='test'):
+        '''
+        def_path: Path to the model definition (.prototxt)
+        phase: Either 'test' or 'train'. Used for filtering phase-specific nodes.
+        '''
+        self.def_path = def_path
+        self.phase = phase
+        self.load()
+
+    def load(self):
+        '''Load the layer definitions from the prototxt.'''
+        self.params = get_caffe_resolver().NetParameter()
+        with open(self.def_path, 'rb') as def_file:
+            text_format.Merge(def_file.read(), self.params)
+
+    def filter_layers(self, layers):
+        '''Filter out layers based on the current phase.
+
+        Only the first include/exclude rule of a layer is inspected. Dropout
+        layers are additionally dropped when the resolved phase is 'test'.
+        Returns the list of layers that are kept, in their original order.
+        '''
+        phase_map = {0: 'train', 1: 'test'}
+        filtered_layer_names = set()
+        filtered_layers = []
+        for layer in layers:
+            phase = self.phase
+            if len(layer.include):
+                phase = phase_map[layer.include[0].phase]
+            if len(layer.exclude):
+                # An exclude rule places the layer in the *other* phase. Read
+                # the phase from the exclude rule itself: indexing `include`
+                # here fails when only `exclude` is specified.
+                phase = phase_map[1 - layer.exclude[0].phase]
+            exclude = (phase != self.phase)
+            # Dropout layers appear in a fair number of Caffe
+            # test-time networks. These are just ignored. We'll
+            # filter them out here.
+            if (not exclude) and (phase == 'test'):
+                exclude = (layer.type == LayerType.Dropout)
+            if not exclude:
+                filtered_layers.append(layer)
+                # Guard against dupes.
+                assert layer.name not in filtered_layer_names
+                filtered_layer_names.add(layer.name)
+        return filtered_layers
+
+    def make_node(self, layer):
+        '''Create a graph node for the given layer.
+
+        Raises KaffeError when the layer type is not a known node kind.
+        '''
+        kind = NodeKind.map_raw_kind(layer.type)
+        if kind is None:
+            raise KaffeError('Unknown layer type encountered: %s' % layer.type)
+        # The node is keyed by the layer's name. References made through a
+        # layer's top ("output") names are re-routed to the right node when
+        # the graph is connected in build().
+        return Node(layer.name, kind, layer=layer)
+
+    def make_input_nodes(self):
+        '''
+        Create data input nodes.
+
+        This method is for old-style inputs, where the input specification
+        was not treated as a first-class layer in the prototext.
+        Newer models use the "Input layer" type.
+        '''
+        nodes = [Node(name, NodeKind.Data) for name in self.params.input]
+        if len(nodes):
+            # Materialize as a list: on Python 3 `map` returns an iterator
+            # that is always truthy and can only be consumed once, which
+            # would defeat the emptiness check and the per-node tuple() below.
+            input_dim = list(map(int, self.params.input_dim))
+            if not input_dim:
+                if len(self.params.input_shape) > 0:
+                    input_dim = list(
+                        map(int, self.params.input_shape[0].dim))
+                else:
+                    raise KaffeError('Dimensions for input not specified.')
+            for node in nodes:
+                node.output_shape = tuple(input_dim)
+        return nodes
+
+    def build(self):
+        '''
+        Builds the graph from the Caffe layer definitions.
+        '''
+        # Get the layers
+        layers = self.params.layers or self.params.layer
+        # Filter out phase-excluded layers
+        layers = self.filter_layers(layers)
+        # Get any separately-specified input layers
+        nodes = self.make_input_nodes()
+        nodes += [self.make_node(layer) for layer in layers]
+        # Initialize the graph
+        graph = Graph(nodes=nodes, name=self.params.name)
+        # Connect the nodes
+        #
+        # A note on layers and outputs:
+        # In Caffe, each layer can produce multiple outputs ("tops") from a set of inputs
+        # ("bottoms"). The bottoms refer to other layers' tops. The top can rewrite a bottom
+        # (in case of in-place operations). Note that the layer's name is not used for establishing
+        # any connectivity. It's only used for data association. By convention, a layer with a
+        # single top will often use the same name (although this is not required).
+        #
+        # The current implementation only supports single-output nodes (note that a node can still
+        # have multiple children, since multiple child nodes can refer to the single top's name).
+        node_outputs = {}
+        for layer in layers:
+            node = graph.get_node(layer.name)
+            for input_name in layer.bottom:
+                assert input_name != layer.name
+                parent_node = node_outputs.get(input_name)
+                if (parent_node is None) or (parent_node == node):
+                    parent_node = graph.get_node(input_name)
+                node.add_parent(parent_node)
+            if len(layer.top) > 1:
+                raise KaffeError('Multiple top nodes are not supported.')
+            for output_name in layer.top:
+                if output_name == layer.name:
+                    # Output is named the same as the node. No further action required.
+                    continue
+                # There are two possibilities here:
+                #
+                # Case 1: output_name refers to another node in the graph.
+                # This is an "in-place operation" that overwrites an existing node.
+                # This would create a cycle in the graph. We'll undo the in-placing
+                # by substituting this node wherever the overwritten node is referenced.
+                #
+                # Case 2: output_name violates the convention layer.name == output_name.
+                # Since we are working in the single-output regime, we can rename it to
+                # match the layer name.
+                #
+                # For both cases, future references to this top re-route to this node.
+                node_outputs[output_name] = node
+
+        graph.compute_output_shapes()
+        return graph
+
+
+class NodeMapper(NodeDispatch):
+    """Base class that walks a graph and maps each node through a handler."""
+
+    def __init__(self, graph):
+        self.graph = graph
+
+    def map(self):
+        """Split the non-input nodes into chains, map them and commit."""
+        ordered = self.graph.topologically_sorted()
+        # Input nodes are handled separately, so they are left out below.
+        input_nodes = self.graph.get_input_nodes()
+        # Decompose the DAG into chains.
+        chains = []
+        for node in ordered:
+            if node in input_nodes:
+                continue
+            target = None
+            if len(node.parents) == 1:
+                parent = node.get_only_parent()
+                # First existing chain that currently ends at the parent.
+                target = next(
+                    (chain for chain in chains if chain[-1] == parent), None)
+            if target is None:
+                # No chain to extend: this node starts a new one.
+                target = []
+                chains.append(target)
+            target.append(node)
+        # Map each chain.
+        mapped_chains = [self.map_chain(chain) for chain in chains]
+        return self.commit(mapped_chains)
+
+    def map_chain(self, chain):
+        """Map every node of ``chain`` and return the results in order."""
+        mapped = []
+        for node in chain:
+            mapped.append(self.map_node(node))
+        return mapped
+
+    def map_node(self, node):
+        """Run the ``map_<kind>`` handler for ``node`` and tag its result."""
+        handler = self.get_handler(node.kind, 'map')
+        result = handler(node)
+        assert result is not None
+        result.node = node
+        return result
+
+    def commit(self, mapped_chains):
+        """Hook for subclasses that consumes the mapped chains."""
+        raise NotImplementedError('Must be implemented by subclass.')
diff --git a/fluid/image_classification/caffe2fluid/kaffe/layers.py b/fluid/image_classification/caffe2fluid/kaffe/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6be35ed727fed76a1c96017455bdaa354ace9f97
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/layers.py
@@ -0,0 +1,152 @@
+import re
+import numbers
+from collections import namedtuple
+
+from .shapes import *
+
+# Maps each known Caffe layer type name to the function that
+# NodeKind.compute_output_shape() calls to obtain that layer's output shape.
+LAYER_DESCRIPTORS = {
+
+    # Caffe Types
+    'AbsVal': shape_identity,
+    'Accuracy': shape_scalar,
+    'ArgMax': shape_not_implemented,
+    'BatchNorm': shape_identity,
+    'BNLL': shape_not_implemented,
+    'Concat': shape_concat,
+    'ContrastiveLoss': shape_scalar,
+    'Convolution': shape_convolution,
+    'Deconvolution': shape_not_implemented,
+    'Data': shape_data,
+    'Dropout': shape_identity,
+    'DummyData': shape_data,
+    'EuclideanLoss': shape_scalar,
+    'Eltwise': shape_identity,
+    'Exp': shape_identity,
+    'Flatten': shape_not_implemented,
+    'HDF5Data': shape_data,
+    'HDF5Output': shape_identity,
+    'HingeLoss': shape_scalar,
+    'Im2col': shape_not_implemented,
+    'ImageData': shape_data,
+    'InfogainLoss': shape_scalar,
+    'InnerProduct': shape_inner_product,
+    'Input': shape_data,
+    'LRN': shape_identity,
+    'MemoryData': shape_mem_data,
+    'MultinomialLogisticLoss': shape_scalar,
+    'MVN': shape_not_implemented,
+    'Pooling': shape_pool,
+    'Power': shape_identity,
+    'ReLU': shape_identity,
+    'Scale': shape_identity,
+    'Sigmoid': shape_identity,
+    'SigmoidCrossEntropyLoss': shape_scalar,
+    'Silence': shape_not_implemented,
+    'Softmax': shape_identity,
+    'SoftmaxWithLoss': shape_scalar,
+    'Split': shape_not_implemented,
+    'Slice': shape_not_implemented,
+    'TanH': shape_identity,
+    'WindowData': shape_not_implemented,
+    'Threshold': shape_identity,
+}
+
+# The names of all known layer types.
+LAYER_TYPES = LAYER_DESCRIPTORS.keys()
+
+# Class exposing every layer type name as an attribute whose value is the
+# name itself (e.g. LayerType.Dropout == 'Dropout').
+LayerType = type('LayerType', (), {t: t for t in LAYER_TYPES})
+
+
+class NodeKind(LayerType):
+    """Layer-kind constants plus helpers keyed on those kinds."""
+
+    @staticmethod
+    def map_raw_kind(kind):
+        """Return ``kind`` when it is a known layer type, otherwise None."""
+        return kind if kind in LAYER_TYPES else None
+
+    @staticmethod
+    def compute_output_shape(node):
+        """Compute the output shape of ``node`` with its kind's shape function.
+
+        A NotImplementedError from the shape function is re-raised as a
+        KaffeError naming the node kind.
+        """
+        shape_func = LAYER_DESCRIPTORS[node.kind]
+        try:
+            return shape_func(node)
+        except NotImplementedError:
+            raise KaffeError(
+                'Output shape computation not implemented for type: %s' %
+                node.kind)
+
+
+class NodeDispatchError(KaffeError):
+    """Raised when no handler or parameter attribute matches a node kind."""
+
+    pass
+
+
+class NodeDispatch(object):
+    """Mixin that resolves handler methods from node-kind names."""
+
+    @staticmethod
+    def get_handler_name(node_kind):
+        """Translate a CamelCase kind into its lower-case handler suffix."""
+        # Short kinds such as ReLU and TanH are simply lower-cased.
+        if len(node_kind) <= 4:
+            return node_kind.lower()
+        # Convert from CamelCase to under_scored, in two passes.
+        partial = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', node_kind)
+        snake = re.sub('([a-z0-9])([A-Z])', r'\1_\2', partial)
+        return snake.lower()
+
+    def get_handler(self, node_kind, prefix):
+        """Return the ``<prefix>_<kind>`` attribute of this object.
+
+        Raises NodeDispatchError when no such attribute exists.
+        """
+        suffix = self.get_handler_name(node_kind)
+        handler_name = '_'.join((prefix, suffix))
+        try:
+            return getattr(self, handler_name)
+        except AttributeError:
+            raise NodeDispatchError(
+                'No handler found for node kind: %s (expected: %s)' %
+                (node_kind, handler_name))
+
+
+class LayerAdapter(object):
+    """Wraps a Caffe layer message and gives kind-aware access to it."""
+
+    def __init__(self, layer, kind):
+        self.layer = layer
+        self.kind = kind
+
+    @property
+    def parameters(self):
+        """Return the layer's ``<kind>_param`` attribute.
+
+        Raises:
+            NodeDispatchError: If the layer has no such attribute.
+        """
+        name = NodeDispatch.get_handler_name(self.kind)
+        name = '_'.join((name, 'param'))
+        try:
+            return getattr(self.layer, name)
+        except AttributeError:
+            raise NodeDispatchError(
+                'Caffe parameters not found for layer kind: %s' % (self.kind))
+
+    @staticmethod
+    def get_kernel_value(scalar, repeated, idx, default=None):
+        """Pick one kernel-related value for spatial dimension ``idx``.
+
+        Args:
+            scalar: Dimension-specific value; returned when truthy.
+            repeated: A single number, or a sequence holding either one value
+                for all dimensions or one value per dimension.
+            idx: Index of the spatial dimension to read from ``repeated``.
+            default: Returned when neither ``scalar`` nor ``repeated`` is
+                truthy.
+
+        Raises:
+            ValueError: If nothing is set and ``default`` is None.
+        """
+        if scalar:
+            return scalar
+        if repeated:
+            if isinstance(repeated, numbers.Number):
+                return repeated
+            if len(repeated) == 1:
+                # Same value applies to all spatial dimensions
+                return int(repeated[0])
+            assert idx < len(repeated)
+            # Extract the value for the given spatial dimension
+            return repeated[idx]
+        if default is None:
+            raise ValueError('Unable to determine kernel parameter!')
+        return default
+
+    @property
+    def kernel_parameters(self):
+        """Return the kernel size, stride and padding as KernelParameters.
+
+        Only valid for Convolution and Pooling layers. Stride defaults to 1
+        and padding to 0; the kernel size has no default.
+        """
+        assert self.kind in (NodeKind.Convolution, NodeKind.Pooling)
+        params = self.parameters
+        k_h = self.get_kernel_value(params.kernel_h, params.kernel_size, 0)
+        k_w = self.get_kernel_value(params.kernel_w, params.kernel_size, 1)
+        s_h = self.get_kernel_value(
+            params.stride_h, params.stride, 0, default=1)
+        s_w = self.get_kernel_value(
+            params.stride_w, params.stride, 1, default=1)
+        p_h = self.get_kernel_value(params.pad_h, params.pad, 0, default=0)
+        # The width padding must come from pad_w; reading pad_h here applied
+        # the vertical padding to both dimensions.
+        p_w = self.get_kernel_value(params.pad_w, params.pad, 1, default=0)
+        return KernelParameters(k_h, k_w, s_h, s_w, p_h, p_w)
+
+
+# Result type of LayerAdapter.kernel_parameters: kernel size, stride and
+# padding, each given for height and width.
+KernelParameters = namedtuple('KernelParameters', [
+    'kernel_h', 'kernel_w', 'stride_h', 'stride_w', 'pad_h', 'pad_w'
+])
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/__init__.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..685b653c392312ac3868b04f9dfb01b80535f677
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/__init__.py
@@ -0,0 +1,2 @@
+from .transformer import Transformer
+from .network import Network
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..620a84e8f1289672151f1f280559a56b37995ce0
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
@@ -0,0 +1,260 @@
+import math
+import os
+import numpy as np
+
+
+def import_fluid():
+    import paddle.v2.fluid as fluid
+    return fluid
+
+
+def layer(op):
+    '''Wrap `op` so that it reads its input from the network's terminals,
+    registers its output under the layer name and returns the network.'''
+
+    def layer_decorated(self, *args, **kwargs):
+        # A generated name is used only when the caller supplied none.
+        fallback = self.get_unique_name(op.__name__)
+        name = kwargs.setdefault('name', fallback)
+        # One terminal is passed as-is, several are passed as a list.
+        terminals = self.terminals
+        if len(terminals) == 0:
+            raise RuntimeError('No input variables found for layer %s.' % name)
+        if len(terminals) == 1:
+            layer_input = terminals[0]
+        else:
+            layer_input = list(terminals)
+        # Run the operation, record its output and make it the next input.
+        layer_output = op(self, layer_input, *args, **kwargs)
+        self.layers[name] = layer_output
+        self.feed(layer_output)
+        return self
+
+    return layer_decorated
+
+
+class Network(object):
+    def __init__(self, inputs, trainable=True):
+        # The input nodes for this network
+        self.inputs = inputs
+        # The current list of terminal nodes
+        self.terminals = []
+        # Mapping from layer names to layers
+        self.layers = dict(inputs)
+        # If true, the resulting variables are set as trainable
+        self.trainable = trainable
+        # Switch variable for dropout
+        self.paddle_env = None
+        self.setup()
+
+    def setup(self):
+        '''Construct the network. '''
+        raise NotImplementedError('Must be implemented by the subclass.')
+
+    def load(self, data_path, exe=None, place=None, ignore_missing=False):
+        '''Load network weights.
+        data_path: a fluid model directory, or a numpy-serialized weight file
+        exe, place: used for loading; a CPU executor is used if either is None
+        ignore_missing: if true, a ValueError while setting a weight is ignored
+        '''
+        fluid = import_fluid()
+        #load fluid mode directly
+        if os.path.isdir(data_path):
+            assert (exe is not None), \
+                'must provide a executor to load fluid model'
+            fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path)
+            return True
+
+        #load model from a npy file
+        if exe is None or place is None:
+            if self.paddle_env is None:
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                self.paddle_env = {'place': place, 'exe': exe}
+                exe.run(fluid.default_startup_program())  # keep exe bound
+            else:
+                place = self.paddle_env['place']
+                exe = self.paddle_env['exe']
+
+        data_dict = np.load(data_path).item()
+        for op_name in data_dict:
+            layer = self.layers[op_name]  # only checks that the layer exists
+            for param_name, data in data_dict[op_name].iteritems():
+                try:
+                    name = '%s_%s' % (op_name, param_name)
+                    var = fluid.global_scope().find_var(name)
+                    var.get_tensor().set(data, place)
+                except ValueError:
+                    if not ignore_missing:
+                        raise
+        return True
+
+    def feed(self, *args):
+        '''Replace the terminal nodes with the given input(s).
+        Each argument is either a layer name, resolved through self.layers,
+        or an actual layer, used as-is. Returns self for chaining.
+        '''
+        assert len(args) != 0
+        self.terminals = []
+        for item in args:
+            is_name = isinstance(item, basestring)
+            if is_name and item not in self.layers:
+                raise KeyError('Unknown layer name fed: %s' % item)
+            # Names are swapped for the layers they refer to.
+            self.terminals.append(self.layers[item] if is_name else item)
+        return self
+
+    def get_output(self):
+        '''Returns the current network output.'''
+        return self.terminals[-1]
+
+    def get_unique_name(self, prefix):
+        '''Build a layer name of the form "<prefix>_<n>", where n is one more
+        than the number of registered layer names starting with prefix.
+        '''
+        taken = [key for key in self.layers if key.startswith(prefix)]
+        return '%s_%d' % (prefix, len(taken) + 1)
+
+    @layer
+    def conv(self,
+             input,
+             k_h,
+             k_w,
+             c_o,  # passed to conv2d as num_filters
+             s_h,
+             s_w,
+             name,  # prefix of the parameter names "<name>_weights/_biases"
+             relu=True,  # True adds act="relu" to the conv2d call
+             padding=None,  # None is replaced by [0, 0]
+             group=1,
+             biased=True):  # NOTE(review): never read below; bias_attr is always set
+        if padding is None:
+            padding = [0, 0]
+
+        # Unpack the input shape after its first entry (h_i and w_i are unused)
+        c_i, h_i, w_i = input.shape[1:]
+
+        # Both channel counts must be divisible by the group count
+        assert c_i % group == 0
+        assert c_o % group == 0
+
+        fluid = import_fluid()
+        prefix = name + '_'
+        output = fluid.layers.conv2d(
+            input=input,
+            filter_size=[k_h, k_w],
+            num_filters=c_o,
+            stride=[s_h, s_w],
+            padding=padding,
+            groups=group,
+            param_attr=fluid.ParamAttr(name=prefix + "weights"),
+            bias_attr=fluid.ParamAttr(name=prefix + "biases"),
+            act="relu" if relu is True else None)
+        return output
+
+    @layer
+    def relu(self, input, name):
+        fluid = import_fluid()
+        output = fluid.layers.relu(x=input)
+        return output
+
+    @layer
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        if padding is None:
+            padding = [0, 0]
+
+        # Get the number of channels in the input
+        h_i, w_i = input.shape[2:]
+        fluid = import_fluid()
+        output = fluid.layers.pool2d(
+            input=input,
+            pool_size=[k_h, k_w],
+            pool_stride=[s_h, s_w],
+            pool_padding=padding,
+            pool_type='max')
+        return output
+
+    @layer
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        if padding is None:
+            padding = [0, 0]
+
+        # Get the number of channels in the input
+        h_i, w_i = input.shape[2:]
+        fluid = import_fluid()
+        output = fluid.layers.pool2d(
+            input=input,
+            pool_size=[k_h, k_w],
+            pool_stride=[s_h, s_w],
+            pool_padding=padding,
+            pool_type='avg')
+        return output
+
+    @layer
+    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
+        raise Exception('lrn() not implemented yet')
+
+    @layer
+    def concat(self, inputs, axis, name):
+        fluid = import_fluid()
+        output = fluid.layers.concat(input=inputs, axis=axis)
+        return output
+
+    @layer
+    def add(self, inputs, name):
+        fluid = import_fluid()
+        output = inputs[0]
+        for i in inputs[1:]:
+            output = fluid.layers.elementwise_add(x=output, y=i)
+        return output
+
+    @layer
+    def fc(self, input, num_out, name, relu=True, act=None):
+        fluid = import_fluid()
+
+        if act is None:
+            act = 'relu' if relu is True else None
+
+        prefix = name + '_'
+        output = fluid.layers.fc(
+            name=name,
+            input=input,
+            size=num_out,
+            act=act,
+            param_attr=fluid.ParamAttr(name=prefix + 'weights'),
+            bias_attr=fluid.ParamAttr(name=prefix + 'biases'))
+        return output
+
+    @layer
+    def softmax(self, input, name):
+        fluid = import_fluid()
+        output = fluid.layers.softmax(x=input, name=name)
+        return output
+
+    @layer
+    def batch_normalization(self, input, name, scale_offset=True, relu=False):
+        # NOTE: Currently, only inference is supported
+        fluid = import_fluid()
+        prefix = name + '_'
+        param_attr = None if scale_offset is False else fluid.ParamAttr(
+            name=prefix + 'scale')
+        bias_attr = None if scale_offset is False else fluid.ParamAttr(
+            name=prefix + 'offset')
+        mean_name = prefix + 'mean'
+        variance_name = prefix + 'variance'
+        output = fluid.layers.batch_norm(
+            name=name,
+            input=input,
+            is_test=True,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            moving_mean_name=mean_name,
+            moving_variance_name=variance_name,
+            epsilon=1e-5,
+            act='relu' if relu is True else None)
+
+        return output
+
+    @layer
+    def dropout(self, input, keep_prob, name):
+        raise Exception('dropout() not implemented yet')
diff --git a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..92b9d32a3a755d8e6a2a8739cc3f42f9c8564b40
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
@@ -0,0 +1,353 @@
+import numpy as np
+
+from ..errors import KaffeError, print_stderr
+from ..graph import GraphBuilder, NodeMapper
+from ..layers import NodeKind
+from ..transformers import (DataInjector, DataReshaper, NodeRenamer, ReLUFuser,
+                            BatchNormScaleBiasFuser, BatchNormPreprocessor,
+                            ParameterNamer)
+from . import network
+
+
+def get_padding_type(kernel_params, input_shape, output_shape):
+    '''Returns the explicit padding to emit for a layer, if any.
+    kernel_params: a 6-tuple (k_h, k_w, s_h, s_w, p_h, p_w)
+    input_shape, output_shape: accepted for interface compatibility
+    and not used.
+    Returns [p_h, p_w] when either pad is positive, None otherwise.
+    '''
+    k_h, k_w, s_h, s_w, p_h, p_w = kernel_params
+    # Padding along a single axis must be kept too, so the pads are
+    # tested individually rather than through their product.
+    if p_h > 0 or p_w > 0:
+        return [p_h, p_w]
+    return None
+
+
+class TensorFlowNode(object):
+    '''A single emitted layer call: an op name plus its arguments.
+    The `node` attribute must be set before emit() is called.
+    '''
+
+    def __init__(self, op, *args, **kwargs):
+        # Name of the operation; emit() writes it as the called function
+        self.op = op
+        # Positional arguments, kept in call order
+        self.args = args
+        # Keyword arguments, stored as a list of (key, value) pairs
+        self.kwargs = list(kwargs.items())
+        # The source Caffe node; emit() reads its name
+        self.node = None
+
+    def format(self, arg):
+        '''Returns the source text for a value; strings get single quotes.'''
+        if isinstance(arg, basestring):
+            return "'%s'" % arg
+        return str(arg)
+
+    def pair(self, key, value):
+        '''Returns the source text "key=value" for a keyword argument.'''
+        return '%s=%s' % (key, self.format(value))
+
+    def emit(self):
+        '''Returns the Python source of the call represented by this node.'''
+        # Positional arguments come first, then keywords, then the name.
+        parts = [self.format(arg) for arg in self.args]
+        parts.extend(self.pair(key, value) for key, value in self.kwargs)
+        parts.append(self.pair('name', self.node.name))
+        return '%s(%s)' % (self.op, ', '.join(parts))
+
+
+class MaybeActivated(object):
+    '''Node factory that injects relu=<not default> when needed.'''
+    def __init__(self, node, default=True):
+        fused = node.metadata.get('relu', False)
+        self.inject_kwargs = {'relu': not default} if fused != default else {}
+
+    def __call__(self, *args, **kwargs):
+        kwargs.update(self.inject_kwargs)
+        return TensorFlowNode(*args, **kwargs)
+
+
+class TensorFlowMapper(NodeMapper):
+    def get_kernel_params(self, node):
+        '''Returns (kernel_params, kwargs); kwargs holds 'padding' if needed.'''
+        kernel_params = node.layer.kernel_parameters
+        parent_shape = node.get_only_parent().output_shape
+        padding = get_padding_type(kernel_params, parent_shape,
+                                   node.output_shape)
+        # The padding keyword is left out when no explicit padding applies.
+        return (kernel_params, {} if padding is None else {'padding': padding})
+
+    def map_convolution(self, node):
+        '''Maps a Caffe convolution node to a 'conv' call.
+        Positional arguments are kernel h/w, output channels and stride h/w.
+        The group keyword is emitted only when group != 1, and biased=False
+        only when the node has no bias term.
+        '''
+        (kernel_params, kwargs) = self.get_kernel_params(node)
+        c_o = node.output_shape[1]
+        group = node.parameters.group
+        if group != 1:
+            kwargs['group'] = group
+        if not node.parameters.bias_term:
+            kwargs['biased'] = False
+        return MaybeActivated(node)(
+            'conv', kernel_params.kernel_h, kernel_params.kernel_w, c_o,
+            kernel_params.stride_h, kernel_params.stride_w, **kwargs)
+
+    def map_relu(self, node):
+        return TensorFlowNode('relu')
+
+    def map_pooling(self, node):
+        '''Maps a pooling node to a 'max_pool' or 'avg_pool' call.'''
+        # Pool codes 0 and 1 are supported; anything else, stochastic
+        # pooling for instance, is rejected.
+        pool_ops = {0: 'max_pool', 1: 'avg_pool'}
+        pool_type = node.parameters.pool
+        if pool_type not in pool_ops:
+            raise KaffeError('Unsupported pooling type.')
+        pool_op = pool_ops[pool_type]
+        (kernel_params, padding) = self.get_kernel_params(node)
+        return TensorFlowNode(pool_op, kernel_params.kernel_h,
+                              kernel_params.kernel_w, kernel_params.stride_h,
+                              kernel_params.stride_w, **padding)
+
+    def map_inner_product(self, node):
+        #TODO: Axis
+        assert node.parameters.axis == 1
+        #TODO: Unbiased
+        assert node.parameters.bias_term == True
+        return MaybeActivated(node)('fc', node.parameters.num_output)
+
+    def map_softmax(self, node):
+        return TensorFlowNode('softmax')
+
+    def map_lrn(self, node):
+        params = node.parameters
+        # The window size must be an odd value. For a window
+        # size of (2*n+1), TensorFlow defines depth_radius = n.
+        assert params.local_size % 2 == 1
+        # Caffe scales by (alpha/(2*n+1)), whereas TensorFlow
+        # just scales by alpha (as does Krizhevsky's paper).
+        # We'll account for that here.
+        alpha = params.alpha / float(params.local_size)
+        return TensorFlowNode('lrn',
+                              int(params.local_size / 2), alpha, params.beta)
+
+    def map_concat(self, node):
+        return TensorFlowNode('concat', node.parameters.axis)
+
+    def map_dropout(self, node):
+        return TensorFlowNode('dropout', node.parameters.dropout_ratio)
+
+    def map_batch_norm(self, node):
+        scale_offset = len(node.data) == 4
+        kwargs = {} if scale_offset else {'scale_offset': False}
+        return MaybeActivated(
+            node, default=False)('batch_normalization', **kwargs)
+
+    def map_eltwise(self, node):
+        operations = {0: 'multiply', 1: 'add', 2: 'max'}
+        op_code = node.parameters.operation
+        try:
+            return TensorFlowNode(operations[op_code])
+        except KeyError:
+            raise KaffeError('Unknown elementwise operation: {}'.format(
+                op_code))
+
+    def commit(self, chains):
+        return chains
+
+
+class TensorFlowEmitter(object):
+    def __init__(self, tab=None):
+        self.tab = tab or ' ' * 4
+        self.prefix = ''
+        self.net_name = ''
+
+    def indent(self):
+        self.prefix += self.tab
+
+    def outdent(self):
+        self.prefix = self.prefix[:-len(self.tab)]
+
+    def statement(self, s):
+        return self.prefix + s + '\n'
+
+    def emit_imports(self):
+        import inspect
+        codes = []
+        codes.append(
+            '### generated by caffe2fluid, your net is in class "%s" ###\n' %
+            (self.net_name))
+        network_source = inspect.getsource(network)
+        codes.append(network_source + '\n')
+        return self.statement('\n'.join(codes))
+
+    def emit_class_def(self, name):
+        return self.statement('class %s(Network):' % (name))
+
+    def emit_setup_def(self):
+        return self.statement('def setup(self):')
+
+    def emit_convert_def(self, input_nodes):
+        def data_layer_def(name, shape, dtype=None):
+            if dtype is None:
+                dtype = 'float32'
+
+            layer_var = name + '_layer'
+            shape = [str(s) for s in shape[1:]]
+            layer_def = '%s = fluid.layers.data(name="%s", shape=[%s], dtype="%s")'\
+                    % (layer_var, name, ','.join(shape), dtype)
+            return layer_var, layer_def
+
+        codes = []
+        inputs = {}
+        for n in input_nodes:
+            name = n.name
+            layer_var, layer_def = data_layer_def(n.name, n.output_shape)
+            codes.append(layer_def)
+            inputs[name] = layer_var
+
+        input_dict = ','.join(['"%s": %s' % (n, l) for n, l in inputs.items()])
+
+        codes.append('feed_data = {' + input_dict + '}')
+        codes.append('net = cls(feed_data)')
+
+        codes.append("place = fluid.CPUPlace()")
+        codes.append("exe = fluid.Executor(place)")
+        codes.append("exe.run(fluid.default_startup_program())")
+        codes.append("net.load(data_path=npy_model, exe=exe, place=place)")
+        codes.append(
+            "fluid.io.save_persistables(executor=exe, dirname=fluid_path)")
+
+        self.outdent()
+        func_def = self.statement('@classmethod')
+        func_def += self.statement('def convert(cls, npy_model, fluid_path):')
+        self.indent()
+        func_def += self.statement('import paddle.v2.fluid as fluid')
+        for l in codes:
+            func_def += self.statement(l)
+        return '\n\n' + func_def
+
+    def emit_main_def(self, name):
+        if name is None:
+            return ''
+
+        self.prefix = ''
+        main_def = self.statement('if __name__ == "__main__":')
+        self.indent()
+        main_def += self.statement("#usage: python xxxnet.py xxx.npy ./model\n")
+        main_def += self.statement("import sys")
+        main_def += self.statement("npy_weight = sys.argv[1]")
+        main_def += self.statement("fluid_model = sys.argv[2]")
+        main_def += self.statement("%s.convert(npy_weight, fluid_model)" %
+                                   (name))
+        main_def += self.statement("exit(0)")
+        return '\n\n' + main_def
+
+    def emit_parents(self, chain):
+        '''Emits the self.feed(...) call naming the first node's parents.'''
+        assert len(chain)
+        call = 'self.feed('
+        names = ["'%s'" % parent.name for parent in chain[0].node.parents]
+        sep = ', \n' + self.prefix + ' ' * len(call)
+        return self.statement(call + sep.join(names) + ')')
+
+    def emit_node(self, node):
+        return self.statement('self.' + node.emit())
+
+    def emit(self, name, chains, input_nodes=None):
+        self.net_name = name
+        s = self.emit_imports()
+        s += self.emit_class_def(name)
+        self.indent()
+        s += self.emit_setup_def()
+        self.indent()
+        blocks = []
+        for chain in chains:
+            b = ''
+            b += self.emit_parents(chain)
+            for node in chain:
+                b += self.emit_node(node)
+            blocks.append(b[:-1])
+        s = s + '\n\n'.join(blocks)
+        s += self.emit_convert_def(input_nodes)
+        s += self.emit_main_def(name)
+        return s
+
+
+class Transformer(object):
+    def __init__(self, def_path, data_path, verbose=True, phase='test'):
+        self.verbose = verbose
+        self.phase = phase
+        self.load(def_path, data_path, phase)
+        self.params = None
+        self.source = None
+
+    def load(self, def_path, data_path, phase):
+        # Build the graph
+        graph = GraphBuilder(def_path, phase).build()
+
+        if data_path is not None:
+            # Load and associate learned parameters
+            graph = DataInjector(def_path, data_path)(graph)
+
+        # Transform the graph
+        transformers = [
+            # Fuse split batch normalization layers
+            BatchNormScaleBiasFuser(),
+
+            # Fuse ReLUs
+            # TODO: Move non-linearity application to layer wrapper, allowing
+            # any arbitrary operation to be optionally activated.
+            ReLUFuser(allowed_parent_types=[
+                NodeKind.Convolution, NodeKind.InnerProduct, NodeKind.BatchNorm
+            ]),
+
+            # Rename nodes
+            # Slashes are used for scoping in TensorFlow. Replace slashes
+            # in node names with underscores.
+            # (Caffe's GoogLeNet implementation uses slashes)
+            NodeRenamer(lambda node: node.name.replace('/', '_'))
+        ]
+        self.graph = graph.transformed(transformers)
+
+        # Display the graph
+        if self.verbose:
+            print_stderr(self.graph)
+
+    def transform_data(self):  # computed once, then cached in self.params
+        if self.params is None:
+            transformers = [
+                # Transpose the weights into the axis order given per node kind
+                DataReshaper({
+                    # Identity permutation: convolution weights keep their order
+                    NodeKind.Convolution: (0, 1, 2, 3),
+
+                    # (c_o, c_i) -> (c_i, c_o)
+                    NodeKind.InnerProduct: (1, 0)
+                }),
+
+                # Pre-process batch normalization data
+                BatchNormPreprocessor(),
+
+                # Convert parameters to dictionaries
+                ParameterNamer(),
+            ]
+            self.graph = self.graph.transformed(transformers)
+            self.params = {  # nodes whose data is empty or None are left out
+                node.name: node.data
+                for node in self.graph.nodes if node.data
+            }
+        return self.params
+
+    def transform_source(self):
+        if self.source is None:
+            mapper = TensorFlowMapper(self.graph)
+            chains = mapper.map()
+            emitter = TensorFlowEmitter()
+            input_nodes = self.graph.get_input_nodes()
+            self.source = emitter.emit(self.graph.name, chains, input_nodes)
+        return self.source
diff --git a/fluid/image_classification/caffe2fluid/kaffe/shapes.py b/fluid/image_classification/caffe2fluid/kaffe/shapes.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8124730c66eaecb85f7aff58e08f6dc16668343
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/shapes.py
@@ -0,0 +1,88 @@
+import math
+from collections import namedtuple
+
+from .errors import KaffeError
+
+TensorShape = namedtuple('TensorShape',
+                         ['batch_size', 'channels', 'height', 'width'])
+
+
+def get_filter_output_shape(i_h, i_w, params, round_func):
+    '''Returns (o_h, o_w) with o = round_func((i + 2 * pad - k) / stride + 1).'''
+    def extent(size, pad, kernel, stride):
+        return int(round_func((size + 2 * pad - kernel) / float(stride) + 1))
+    return (extent(i_h, params.pad_h, params.kernel_h, params.stride_h),
+            extent(i_w, params.pad_w, params.kernel_w, params.stride_w))
+
+
+def get_strided_kernel_output_shape(node, round_func):
+    assert node.layer is not None
+    input_shape = node.get_only_parent().output_shape
+    o_h, o_w = get_filter_output_shape(input_shape.height, input_shape.width,
+                                       node.layer.kernel_parameters, round_func)
+    params = node.layer.parameters
+    has_c_o = hasattr(params, 'num_output')
+    c = params.num_output if has_c_o else input_shape.channels
+    return TensorShape(input_shape.batch_size, c, o_h, o_w)
+
+
+def shape_not_implemented(node):
+    raise NotImplementedError
+
+
+def shape_identity(node):
+    assert len(node.parents) > 0
+    return node.parents[0].output_shape
+
+
+def shape_scalar(node):
+    return TensorShape(1, 1, 1, 1)
+
+
+def shape_data(node):
+    '''Returns the output shape of a data/input node.'''
+    if node.output_shape:
+        # Old-style input specification
+        return node.output_shape
+    try:
+        # New-style input specification
+        return [int(dim) for dim in node.parameters.shape[0].dim]
+    except Exception:
+        # We most likely have a data layer on our hands. Caffe infers the
+        # dimensions of the data from the source (eg: LMDB), and we want to
+        # avoid reading datasets here. Fail for now. A temporary fix is to
+        # transform the data layer into Caffe's "input" layer (as is usually
+        # used in the "deploy" version). TODO: Find a better solution.
+        raise KaffeError('Cannot determine dimensions of data layer.\n'
+                         'See comments in function shape_data for more info.')
+
+
+def shape_mem_data(node):
+    params = node.parameters
+    return TensorShape(params.batch_size, params.channels, params.height,
+                       params.width)
+
+
+def shape_concat(node):
+    '''Returns the first parent's shape with the sizes of all parents
+    summed along the concat axis.'''
+    axis = node.layer.parameters.axis
+    shapes = [parent.output_shape for parent in node.parents]
+    merged = list(shapes[0]) if shapes else None
+    for shape in shapes[1:]:
+        merged[axis] += shape[axis]
+    return tuple(merged)
+
+
+def shape_convolution(node):
+    return get_strided_kernel_output_shape(node, math.floor)
+
+
+def shape_pool(node):
+    return get_strided_kernel_output_shape(node, math.ceil)
+
+
+def shape_inner_product(node):
+    input_shape = node.get_only_parent().output_shape
+    return TensorShape(input_shape.batch_size, node.layer.parameters.num_output,
+                       1, 1)
diff --git a/fluid/image_classification/caffe2fluid/kaffe/transformers.py b/fluid/image_classification/caffe2fluid/kaffe/transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d300ca9c90672c3f3a3dbf7a14e48db6bb48f70
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/kaffe/transformers.py
@@ -0,0 +1,303 @@
+'''
+A collection of graph transforms.
+
+A transformer is a callable that accepts a graph and returns a transformed version.
+'''
+import os
+import numpy as np
+
+from .caffe import get_caffe_resolver, has_pycaffe
+from .errors import KaffeError, debug, notice, warn
+from .layers import NodeKind
+
+
+class DataInjector(object):
+    '''
+    Associates parameters loaded from a .caffemodel file with their corresponding nodes.
+    '''
+
+    def __init__(self, def_path, data_path):
+        # The .prototxt file defining the graph
+        self.def_path = def_path
+        # The .caffemodel file containing the learned parameters
+        self.data_path = data_path
+        # Set to true if the fallback protocol-buffer based backend was used
+        self.did_use_pb = False
+        # A list containing (layer name, parameters) tuples
+        self.params = None
+        # Load the parameters
+        self.load()
+
+    def load(self):
+        if has_pycaffe():
+            self.load_using_caffe()
+        else:
+            self.load_using_pb()
+
+    def load_using_caffe(self):
+        caffe = get_caffe_resolver().caffe
+        net = caffe.Net(self.def_path, self.data_path, caffe.TEST)
+        data = lambda blob: blob.data
+        self.params = [(k, map(data, v)) for k, v in net.params.items()]
+
+    def load_using_pb(self):  # fallback for when pycaffe is unavailable
+        data = get_caffe_resolver().NetParameter()
+        with open(self.data_path, 'rb') as model_file:  # closed on exit
+            data.MergeFromString(model_file.read())
+        self.params = [(layer.name, self.normalize_pb_data(layer))
+                       for layer in (data.layers or data.layer) if layer.blobs]
+        self.did_use_pb = True
+
+    def normalize_pb_data(self, layer):
+        '''Returns the layer's blobs as a list of 4-d float32 arrays.
+        Shapes come from blob.shape.dim, left-padded with ones to four entries,
+        or from the legacy num/channels/height/width fields if dim is empty.
+        '''
+        arrays = []
+        for blob in layer.blobs:
+            dims = list(blob.shape.dim)
+            if not dims:
+                dims = [blob.num, blob.channels, blob.height, blob.width]
+            c_o, c_i, h, w = [int(d) for d in [1] * (4 - len(dims)) + dims]
+            data = np.array(blob.data, dtype=np.float32)
+            arrays.append(data.reshape(c_o, c_i, h, w))
+        return arrays
+
+    def adjust_parameters(self, node, data):
+        '''Returns the node's blobs with singleton dimensions squeezed out of
+        the bias (index 1) and, for InnerProduct nodes, the weights (index 0).
+        When the protobuf backend was not used, data is returned unchanged.
+        '''
+        if not self.did_use_pb:
+            return data
+        # With the protobuf backend every blob starts out four-dimensional;
+        # only the common cases are handled here.
+        data = list(data)
+        squeeze_indices = [1]  # Squeeze biases.
+        if node.kind == NodeKind.InnerProduct:
+            squeeze_indices.append(0)  # Squeeze FC.
+
+        for idx in squeeze_indices:
+            if idx >= len(data):
+                continue
+            shape_old = data[idx].shape
+            data[idx] = np.squeeze(data[idx])
+            # Compare ranks: log only when a dimension was actually removed.
+            if len(shape_old) != len(data[idx].shape):
+                debug('squeeze idx:%d, with kind:%s,name:%s' % \
+                        (idx, node.kind, node.name))
+        return data
+
+    def __call__(self, graph):
+        for layer_name, data in self.params:
+            if layer_name in graph:
+                node = graph.get_node(layer_name)
+                node.data = self.adjust_parameters(node, data)
+            else:
+                notice('Ignoring parameters for non-existent layer: %s' % \
+                        layer_name)
+        return graph
+
+
+class DataReshaper(object):
+    def __init__(self, mapping, replace=True):
+        # A dictionary mapping NodeKind to the transposed order.
+        self.mapping = mapping
+        # The node kinds eligible for reshaping
+        self.reshaped_node_types = self.mapping.keys()
+        # If true, the reshaped data will replace the old one.
+        # Otherwise, it's set to the reshaped_data attribute.
+        self.replace = replace
+
+    def has_spatial_parent(self, node):
+        try:
+            parent = node.get_only_parent()
+            s = parent.output_shape
+            return s.height > 1 or s.width > 1
+        except KaffeError:
+            return False
+
+    def map(self, node_kind):
+        '''Returns the transpose order registered for the given node kind.
+        A KeyError propagates unchanged when the kind has no entry.
+        '''
+        order = self.mapping[node_kind]
+        return order
+
+    def __call__(self, graph):
+        for node in graph.nodes:
+            if node.data is None:
+                continue
+            if node.kind not in self.reshaped_node_types:
+                # Check for 2+ dimensional data
+                if any(len(tensor.shape) > 1 for tensor in node.data):
+                    notice('parmaters not reshaped for node: {}'.format(node))
+                continue
+            transpose_order = self.map(node.kind)
+            weights = node.data[0]
+            if (node.kind == NodeKind.InnerProduct
+                ) and self.has_spatial_parent(node):
+                # The FC layer connected to the spatial layer needs to be
+                # re-wired to match the new spatial ordering.
+                in_shape = node.get_only_parent().output_shape
+                fc_shape = weights.shape
+                output_channels = fc_shape[0]
+                weights = weights.reshape((output_channels, -1))
+                weights = weights.transpose(transpose_order)
+                node.reshaped_data = weights
+            else:
+                node.reshaped_data = weights.transpose(transpose_order)
+
+        if self.replace:
+            for node in graph.nodes:
+                if hasattr(node, 'reshaped_data'):
+                    # Set the weights
+                    node.data[0] = node.reshaped_data
+                    del node.reshaped_data
+        return graph
+
+
+class SubNodeFuser(object):
+    '''
+    An abstract helper for merging a single-child with its single-parent.
+    '''
+
+    def __call__(self, graph):
+        nodes = graph.nodes
+        fused_nodes = []
+        for node in nodes:
+            if len(node.parents) != 1:
+                # We're only fusing nodes with single parents
+                continue
+            parent = node.get_only_parent()
+            if len(parent.children) != 1:
+                # We can only fuse a node if its parent's
+                # value isn't used by any other node.
+                continue
+            if not self.is_eligible_pair(parent, node):
+                continue
+            # Rewrite the fused node's children to its parent.
+            for child in node.children:
+                child.parents.remove(node)
+                parent.add_child(child)
+            # Disconnect the fused node from the graph.
+            parent.children.remove(node)
+            fused_nodes.append(node)
+            # Let the sub-class merge the fused node in any arbitrary way.
+            self.merge(parent, node)
+        transformed_nodes = [node for node in nodes if node not in fused_nodes]
+        return graph.replaced(transformed_nodes)
+
+    def is_eligible_pair(self, parent, child):
+        '''Returns true if this parent/child pair is eligible for fusion.'''
+        raise NotImplementedError('Must be implemented by subclass.')
+
+    def merge(self, parent, child):
+        '''Merge the child node into the parent.'''
+        raise NotImplementedError('Must be implemented by subclass')
+
+
+class ReLUFuser(SubNodeFuser):
+    '''
+    Fuses rectified linear units with their parent nodes.
+    '''
+
+    def __init__(self, allowed_parent_types=None):
+        # Fuse ReLUs when the parent node is one of the given types.
+        # If None, all node types are eligible.
+        self.allowed_parent_types = allowed_parent_types
+
+    def is_eligible_pair(self, parent, child):
+        return ((self.allowed_parent_types is None or \
+                parent.kind in self.allowed_parent_types) and \
+                child.kind == NodeKind.ReLU)
+
+    def merge(self, parent, _):
+        parent.metadata['relu'] = True
+
+
+class BatchNormScaleBiasFuser(SubNodeFuser):
+    '''
+    The original batch normalization paper includes two learned
+    parameters: a scaling factor gamma and a bias beta.
+    Caffe's implementation does not include these two. However, it is commonly
+    replicated by adding a scaling+bias layer immediately after the batch norm.
+
+    This fuser merges the scaling+bias layer with the batch norm.
+    '''
+
+    def is_eligible_pair(self, parent, child):
+        return (parent.kind == NodeKind.BatchNorm and \
+                child.kind == NodeKind.Scale and \
+                child.parameters.axis == 1 and \
+                child.parameters.bias_term == True)
+
+    def merge(self, parent, child):
+        parent.scale_bias_node = child
+
+
+class BatchNormPreprocessor(object):
+    '''
+    Prescale batch normalization parameters.
+    Concatenate gamma (scale) and beta (bias) terms if set.
+    '''
+
+    def __call__(self, graph):
+        for node in graph.nodes:
+            if node.kind != NodeKind.BatchNorm:
+                continue
+            assert node.data is not None
+            assert len(node.data) == 3
+            node.data = [np.squeeze(i) for i in node.data]
+            mean, variance, scale = node.data
+            # Prescale the stats
+            scaling_factor = 1.0 / scale if scale != 0 else 0
+            mean *= scaling_factor
+            variance *= scaling_factor
+            # Replace with the updated values
+            node.data = [mean, variance]
+            if hasattr(node, 'scale_bias_node'):
+                # Include the scale and bias terms
+                gamma, beta = node.scale_bias_node.data
+                node.data += [np.squeeze(i) for i in [gamma, beta]]
+        return graph
+
+
+class NodeRenamer(object):
+    '''
+    Renames nodes in the graph using a given unary function that
+    accepts a node and returns its new name.
+    '''
+
+    def __init__(self, renamer):
+        self.renamer = renamer
+
+    def __call__(self, graph):
+        for node in graph.nodes:
+            node.name = self.renamer(node)
+        return graph
+
+
+class ParameterNamer(object):
+    '''
+    Convert layer data arrays to a dictionary mapping parameter names to their values.
+    '''
+
+    def __call__(self, graph):
+        for node in graph.nodes:
+            if node.data is None:
+                continue
+            if node.kind in (NodeKind.Convolution, NodeKind.InnerProduct):
+                names = ('weights', )
+                if node.parameters.bias_term:
+                    names += ('biases', )
+            elif node.kind == NodeKind.BatchNorm:
+                names = ('mean', 'variance')
+                if len(node.data) == 4:
+                    names += ('scale', 'offset')
+            else:
+                warn('Unhandled parameters: {}'.format(node.kind))
+                continue
+            assert len(names) == len(node.data)
+            node.data = dict(zip(names, node.data))
+        return graph
diff --git a/fluid/image_classification/caffe2fluid/proto/caffe.proto b/fluid/image_classification/caffe2fluid/proto/caffe.proto
new file mode 100644
index 0000000000000000000000000000000000000000..18eb5ca6491cbc59297c36854ddbd2a46ebfab9e
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/proto/caffe.proto
@@ -0,0 +1,1411 @@
+syntax = "proto2";
+
+package caffe;
+
+// Specifies the shape (dimensions) of a Blob.
+message BlobShape { repeated int64 dim = 1 [ packed = true ]; }
+
+message BlobProto {
+  optional BlobShape shape = 7;
+  repeated float data = 5 [ packed = true ];
+  repeated float diff = 6 [ packed = true ];
+  repeated double double_data = 8 [ packed = true ];
+  repeated double double_diff = 9 [ packed = true ];
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  optional int32 num = 1 [ default = 0 ];
+  optional int32 channels = 2 [ default = 0 ];
+  optional int32 height = 3 [ default = 0 ];
+  optional int32 width = 4 [ default = 0 ];
+}
+
+// The BlobProtoVector is simply a way to pass multiple blobproto instances
+// around.
+message BlobProtoVector { repeated BlobProto blobs = 1; }
+
+message Datum {
+  optional int32 channels = 1;
+  optional int32 height = 2;
+  optional int32 width = 3;
+  // the actual image data, in bytes
+  optional bytes data = 4;
+  optional int32 label = 5;
+  // Optionally, the datum could also hold float data.
+  repeated float float_data = 6;
+  // If true, data contains an encoded image that needs to be decoded
+  optional bool encoded = 7 [ default = false ];
+}
+
+message FillerParameter {
+  // The filler type.
+  optional string type = 1 [ default = 'constant' ];
+  optional float value = 2 [ default = 0 ]; // the value in constant filler
+  optional float min = 3 [ default = 0 ];   // the min value in uniform filler
+  optional float max = 4 [ default = 1 ];   // the max value in uniform filler
+  optional float mean = 5 [ default = 0 ];  // the mean value in Gaussian filler
+  optional float std = 6 [ default = 1 ];   // the std value in Gaussian filler
+  // The expected number of non-zero output weights for a given input in
+  // Gaussian filler -- the default -1 means don't perform sparsification.
+  optional int32 sparse = 7 [ default = -1 ];
+  // Normalize the filler variance by fan_in, fan_out, or their average.
+  // Applies to 'xavier' and 'msra' fillers.
+  enum VarianceNorm {
+    FAN_IN = 0;
+    FAN_OUT = 1;
+    AVERAGE = 2;
+  }
+  optional VarianceNorm variance_norm = 8 [ default = FAN_IN ];
+}
+
+message NetParameter {
+  optional string name = 1; // consider giving the network a name
+  // DEPRECATED. See InputParameter. The input blobs to the network.
+  repeated string input = 3;
+  // DEPRECATED. See InputParameter. The shape of the input blobs.
+  repeated BlobShape input_shape = 8;
+
+  // 4D input dimensions -- deprecated.  Use "input_shape" instead.
+  // If specified, for each input blob there should be four
+  // values specifying the num, channels, height and width of the input blob.
+  // Thus, there should be a total of (4 * #input) numbers.
+  repeated int32 input_dim = 4;
+
+  // Whether the network will force every layer to carry out backward operation.
+  // If set False, then whether to carry out backward is determined
+  // automatically according to the net structure and learning rates.
+  optional bool force_backward = 5 [ default = false ];
+  // The current "state" of the network, including the phase, level, and stage.
+  // Some layers may be included/excluded depending on this state and the states
+  // specified in the layers' include and exclude fields.
+  optional NetState state = 6;
+
+  // Print debugging information about results while running Net::Forward,
+  // Net::Backward, and Net::Update.
+  optional bool debug_info = 7 [ default = false ];
+
+  // The layers that make up the net.  Each of their configurations, including
+  // connectivity and behavior, is specified as a LayerParameter.
+  repeated LayerParameter layer = 100; // ID 100 so layers are printed last.
+
+  // DEPRECATED: use 'layer' instead.
+  repeated V1LayerParameter layers = 2;
+}
+
+// NOTE
+// Update the next available ID when you add a new SolverParameter field.
+//
+// SolverParameter next available ID: 42 (last added: layer_wise_reduce)
+message SolverParameter {
+  //////////////////////////////////////////////////////////////////////////////
+  // Specifying the train and test networks
+  //
+  // Exactly one train net must be specified using one of the following fields:
+  //     train_net_param, train_net, net_param, net
+  // One or more test nets may be specified using any of the following fields:
+  //     test_net_param, test_net, net_param, net
+  // If more than one test net field is specified (e.g., both net and
+  // test_net are specified), they will be evaluated in the field order given
+  // above: (1) test_net_param, (2) test_net, (3) net_param/net.
+  // A test_iter must be specified for each test_net.
+  // A test_level and/or a test_stage may also be specified for each test_net.
+  //////////////////////////////////////////////////////////////////////////////
+
+  // Proto filename for the train net, possibly combined with one or more
+  // test nets.
+  optional string net = 24;
+  // Inline train net param, possibly combined with one or more test nets.
+  optional NetParameter net_param = 25;
+
+  optional string train_net = 1; // Proto filename for the train net.
+  repeated string test_net = 2;  // Proto filenames for the test nets.
+  optional NetParameter train_net_param = 21; // Inline train net params.
+  repeated NetParameter test_net_param = 22;  // Inline test net params.
+
+  // The states for the train/test nets. Must be unspecified or
+  // specified once per net.
+  //
+  // By default, train_state will have phase = TRAIN,
+  // and all test_state's will have phase = TEST.
+  // Other defaults are set according to the NetState defaults.
+  optional NetState train_state = 26;
+  repeated NetState test_state = 27;
+
+  // The number of iterations for each test net.
+  repeated int32 test_iter = 3;
+
+  // The number of iterations between two testing phases.
+  optional int32 test_interval = 4 [ default = 0 ];
+  optional bool test_compute_loss = 19 [ default = false ];
+  // If true, run an initial test pass before the first iteration,
+  // ensuring memory availability and printing the starting value of the loss.
+  optional bool test_initialization = 32 [ default = true ];
+  optional float base_lr = 5; // The base learning rate
+  // the number of iterations between displaying info. If display = 0, no info
+  // will be displayed.
+  optional int32 display = 6;
+  // Display the loss averaged over the last average_loss iterations
+  optional int32 average_loss = 33 [ default = 1 ];
+  optional int32 max_iter = 7; // the maximum number of iterations
+  // accumulate gradients over `iter_size` x `batch_size` instances
+  optional int32 iter_size = 36 [ default = 1 ];
+
+  // The learning rate decay policy. The currently implemented learning rate
+  // policies are as follows:
+  //    - fixed: always return base_lr.
+  //    - step: return base_lr * gamma ^ (floor(iter / step))
+  //    - exp: return base_lr * gamma ^ iter
+  //    - inv: return base_lr * (1 + gamma * iter) ^ (- power)
+  //    - multistep: similar to step but it allows non uniform steps defined by
+  //      stepvalue
+  //    - poly: the effective learning rate follows a polynomial decay, to be
+  //      zero by the max_iter. return base_lr * (1 - iter/max_iter) ^ (power)
+  //    - sigmoid: the effective learning rate follows a sigmoid decay
+  //      return base_lr * (1 / (1 + exp(-gamma * (iter - stepsize))))
+  //
+  // where base_lr, max_iter, gamma, step, stepvalue and power are defined
+  // in the solver parameter protocol buffer, and iter is the current iteration.
+  optional string lr_policy = 8;
+  optional float gamma = 9;     // The parameter to compute the learning rate.
+  optional float power = 10;    // The parameter to compute the learning rate.
+  optional float momentum = 11; // The momentum value.
+  optional float weight_decay = 12; // The weight decay.
+  // regularization types supported: L1 and L2
+  // controlled by weight_decay
+  optional string regularization_type = 29 [ default = "L2" ];
+  // the stepsize for learning rate policy "step"
+  optional int32 stepsize = 13;
+  // the stepsize for learning rate policy "multistep"
+  repeated int32 stepvalue = 34;
+
+  // Set clip_gradients to >= 0 to clip parameter gradients to that L2 norm,
+  // whenever their actual L2 norm is larger.
+  optional float clip_gradients = 35 [ default = -1 ];
+
+  optional int32 snapshot = 14 [ default = 0 ]; // The snapshot interval
+  optional string snapshot_prefix = 15;         // The prefix for the snapshot.
+  // whether to snapshot diff in the results or not. Snapshotting diff will help
+  // debugging but the final protocol buffer size will be much larger.
+  optional bool snapshot_diff = 16 [ default = false ];
+  enum SnapshotFormat {
+    HDF5 = 0;
+    BINARYPROTO = 1;
+  }
+  optional SnapshotFormat snapshot_format = 37 [ default = BINARYPROTO ];
+  // the mode solver will use: 0 for CPU and 1 for GPU. Use GPU by default.
+  enum SolverMode {
+    CPU = 0;
+    GPU = 1;
+  }
+  optional SolverMode solver_mode = 17 [ default = GPU ];
+  // the device_id that will be used in GPU mode. Use device_id = 0 by default.
+  optional int32 device_id = 18 [ default = 0 ];
+  // If non-negative, the seed with which the Solver will initialize the Caffe
+  // random number generator -- useful for reproducible results. Otherwise,
+  // (and by default) initialize using a seed derived from the system clock.
+  optional int64 random_seed = 20 [ default = -1 ];
+
+  // type of the solver
+  optional string type = 40 [ default = "SGD" ];
+
+  // numerical stability for RMSProp, AdaGrad, AdaDelta and Adam
+  optional float delta = 31 [ default = 1e-8 ];
+  // parameters for the Adam solver
+  optional float momentum2 = 39 [ default = 0.999 ];
+
+  // RMSProp decay value
+  // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
+  optional float rms_decay = 38 [ default = 0.99 ];
+
+  // If true, print information about the state of the net that may help with
+  // debugging learning problems.
+  optional bool debug_info = 23 [ default = false ];
+
+  // If false, don't save a snapshot after training finishes.
+  optional bool snapshot_after_train = 28 [ default = true ];
+
+  // DEPRECATED: old solver enum types, use string instead
+  enum SolverType {
+    SGD = 0;
+    NESTEROV = 1;
+    ADAGRAD = 2;
+    RMSPROP = 3;
+    ADADELTA = 4;
+    ADAM = 5;
+  }
+  // DEPRECATED: use type instead of solver_type
+  optional SolverType solver_type = 30 [ default = SGD ];
+
+  // Overlap compute and communication for data parallel training
+  optional bool layer_wise_reduce = 41 [ default = true ];
+}
+
+// A message that stores the solver snapshots
+message SolverState {
+  optional int32 iter = 1;         // The current iteration
+  optional string learned_net = 2; // The file that stores the learned net.
+  repeated BlobProto history = 3;  // The history for sgd solvers
+  optional int32 current_step = 4
+      [ default = 0 ]; // The current step for learning rate
+}
+
+enum Phase {
+  TRAIN = 0;
+  TEST = 1;
+}
+
+message NetState {
+  optional Phase phase = 1 [ default = TEST ];
+  optional int32 level = 2 [ default = 0 ];
+  repeated string stage = 3;
+}
+
+message NetStateRule {
+  // Set phase to require the NetState have a particular phase (TRAIN or TEST)
+  // to meet this rule.
+  optional Phase phase = 1;
+
+  // Set the minimum and/or maximum levels in which the layer should be used.
+  // Leave undefined to meet the rule regardless of level.
+  optional int32 min_level = 2;
+  optional int32 max_level = 3;
+
+  // Customizable sets of stages to include or exclude.
+  // The net must have ALL of the specified stages and NONE of the specified
+  // "not_stage"s to meet the rule.
+  // (Use multiple NetStateRules to specify conjunctions of stages.)
+  repeated string stage = 4;
+  repeated string not_stage = 5;
+}
+
+// Specifies training parameters (multipliers on global learning constants,
+// and the name and other settings used for weight sharing).
+message ParamSpec {
+  // The names of the parameter blobs -- useful for sharing parameters among
+  // layers, but never required otherwise.  To share a parameter between two
+  // layers, give it a (non-empty) name.
+  optional string name = 1;
+
+  // Whether to require shared weights to have the same shape, or just the same
+  // count -- defaults to STRICT if unspecified.
+  optional DimCheckMode share_mode = 2;
+  enum DimCheckMode {
+    // STRICT (default) requires that num, channels, height, width each match.
+    STRICT = 0;
+    // PERMISSIVE requires only the count (num*channels*height*width) to match.
+    PERMISSIVE = 1;
+  }
+
+  // The multiplier on the global learning rate for this parameter.
+  optional float lr_mult = 3 [ default = 1.0 ];
+
+  // The multiplier on the global weight decay for this parameter.
+  optional float decay_mult = 4 [ default = 1.0 ];
+}
+
+// NOTE
+// Update the next available ID when you add a new LayerParameter field.
+//
+// LayerParameter next available layer-specific ID: 147 (last added:
+// recurrent_param)
+message LayerParameter {
+  optional string name = 1;   // the layer name
+  optional string type = 2;   // the layer type
+  repeated string bottom = 3; // the name of each bottom blob
+  repeated string top = 4;    // the name of each top blob
+
+  // The train / test phase for computation.
+  optional Phase phase = 10;
+
+  // The amount of weight to assign each top blob in the objective.
+  // Each layer assigns a default value, usually of either 0 or 1,
+  // to each top blob.
+  repeated float loss_weight = 5;
+
+  // Specifies training parameters (multipliers on global learning constants,
+  // and the name and other settings used for weight sharing).
+  repeated ParamSpec param = 6;
+
+  // The blobs containing the numeric parameters of the layer.
+  repeated BlobProto blobs = 7;
+
+  // Specifies whether to backpropagate to each bottom. If unspecified,
+  // Caffe will automatically infer whether each input needs backpropagation
+  // to compute parameter gradients. If set to true for some inputs,
+  // backpropagation to those inputs is forced; if set false for some inputs,
+  // backpropagation to those inputs is skipped.
+  //
+  // The size must be either 0 or equal to the number of bottoms.
+  repeated bool propagate_down = 11;
+
+  // Rules controlling whether and when a layer is included in the network,
+  // based on the current NetState.  You may specify a non-zero number of rules
+  // to include OR exclude, but not both.  If no include or exclude rules are
+  // specified, the layer is always included.  If the current NetState meets
+  // ANY (i.e., one or more) of the specified rules, the layer is
+  // included/excluded.
+  repeated NetStateRule include = 8;
+  repeated NetStateRule exclude = 9;
+
+  // Parameters for data pre-processing.
+  optional TransformationParameter transform_param = 100;
+
+  // Parameters shared by loss layers.
+  optional LossParameter loss_param = 101;
+
+  // Layer type-specific parameters.
+  //
+  // Note: certain layers may have more than one computational engine
+  // for their implementation. These layers include an Engine type and
+  // engine parameter for selecting the implementation.
+  // The default for the engine is set by the ENGINE switch at compile-time.
+  optional AccuracyParameter accuracy_param = 102;
+  optional ArgMaxParameter argmax_param = 103;
+  optional BatchNormParameter batch_norm_param = 139;
+  optional BiasParameter bias_param = 141;
+  optional ConcatParameter concat_param = 104;
+  optional ContrastiveLossParameter contrastive_loss_param = 105;
+  optional ConvolutionParameter convolution_param = 106;
+  optional CropParameter crop_param = 144;
+  optional DataParameter data_param = 107;
+  optional DropoutParameter dropout_param = 108;
+  optional DummyDataParameter dummy_data_param = 109;
+  optional EltwiseParameter eltwise_param = 110;
+  optional ELUParameter elu_param = 140;
+  optional EmbedParameter embed_param = 137;
+  optional ExpParameter exp_param = 111;
+  optional FlattenParameter flatten_param = 135;
+  optional HDF5DataParameter hdf5_data_param = 112;
+  optional HDF5OutputParameter hdf5_output_param = 113;
+  optional HingeLossParameter hinge_loss_param = 114;
+  optional ImageDataParameter image_data_param = 115;
+  optional InfogainLossParameter infogain_loss_param = 116;
+  optional InnerProductParameter inner_product_param = 117;
+  optional InputParameter input_param = 143;
+  optional LogParameter log_param = 134;
+  optional LRNParameter lrn_param = 118;
+  optional MemoryDataParameter memory_data_param = 119;
+  optional MVNParameter mvn_param = 120;
+  optional ParameterParameter parameter_param = 145;
+  optional PoolingParameter pooling_param = 121;
+  optional PowerParameter power_param = 122;
+  optional PReLUParameter prelu_param = 131;
+  optional PythonParameter python_param = 130;
+  optional RecurrentParameter recurrent_param = 146;
+  optional ReductionParameter reduction_param = 136;
+  optional ReLUParameter relu_param = 123;
+  optional ReshapeParameter reshape_param = 133;
+  optional ScaleParameter scale_param = 142;
+  optional SigmoidParameter sigmoid_param = 124;
+  optional SoftmaxParameter softmax_param = 125;
+  optional SPPParameter spp_param = 132;
+  optional SliceParameter slice_param = 126;
+  optional TanHParameter tanh_param = 127;
+  optional ThresholdParameter threshold_param = 128;
+  optional TileParameter tile_param = 138;
+  optional WindowDataParameter window_data_param = 129;
+}
+
+// Message that stores parameters used to apply transformation
+// to the data layer's data
+message TransformationParameter {
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 1 [ default = 1 ];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 2 [ default = false ];
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 3 [ default = 0 ];
+  // mean_file and mean_value cannot be specified at the same time
+  optional string mean_file = 4;
+  // if specified can be repeated once (would subtract it from all the channels)
+  // or can be repeated the same number of times as channels
+  // (would subtract them from the corresponding channel)
+  repeated float mean_value = 5;
+  // Force the decoded image to have 3 color channels.
+  optional bool force_color = 6 [ default = false ];
+  // Force the decoded image to have 1 color channel.
+  optional bool force_gray = 7 [ default = false ];
+}
+
+// Message that stores parameters shared by loss layers
+message LossParameter {
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 1;
+  // How to normalize the loss for loss layers that aggregate across batches,
+  // spatial dimensions, or other dimensions.  Currently only implemented in
+  // SoftmaxWithLoss and SigmoidCrossEntropyLoss layers.
+  enum NormalizationMode {
+    // Divide by the number of examples in the batch times spatial dimensions.
+    // Outputs that receive the ignore label will NOT be ignored in computing
+    // the normalization factor.
+    FULL = 0;
+    // Divide by the total number of output locations that do not take the
+    // ignore_label.  If ignore_label is not set, this behaves like FULL.
+    VALID = 1;
+    // Divide by the batch size.
+    BATCH_SIZE = 2;
+    // Do not normalize the loss.
+    NONE = 3;
+  }
+  // For historical reasons, the default normalization for
+  // SigmoidCrossEntropyLoss is BATCH_SIZE and *not* VALID.
+  optional NormalizationMode normalization = 3 [ default = VALID ];
+  // Deprecated.  Ignored if normalization is specified.  If normalization
+  // is not specified, then setting this to false will be equivalent to
+  // normalization = BATCH_SIZE to be consistent with previous behavior.
+  optional bool normalize = 2;
+}
+
+// Messages that store parameters used by individual layer types follow, in
+// alphabetical order.
+
+message AccuracyParameter {
+  // When computing accuracy, count as correct by comparing the true label to
+  // the top k scoring classes.  By default, only compare to the top scoring
+  // class (i.e. argmax).
+  optional uint32 top_k = 1 [ default = 1 ];
+
+  // The "label" axis of the prediction blob, whose argmax corresponds to the
+  // predicted label -- may be negative to index from the end (e.g., -1 for the
+  // last axis).  For example, if axis == 1 and the predictions are
+  // (N x C x H x W), the label blob is expected to contain N*H*W ground truth
+  // labels with integer values in {0, 1, ..., C-1}.
+  optional int32 axis = 2 [ default = 1 ];
+
+  // If specified, ignore instances with the given label.
+  optional int32 ignore_label = 3;
+}
+
+message ArgMaxParameter {
+  // If true produce pairs (argmax, maxval)
+  optional bool out_max_val = 1 [ default = false ];
+  optional uint32 top_k = 2 [ default = 1 ];
+  // The axis along which to maximise -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // By default ArgMaxLayer maximizes over the flattened trailing dimensions
+  // for each index of the first / num dimension.
+  optional int32 axis = 3;
+}
+
+message ConcatParameter {
+  // The axis along which to concatenate -- may be negative to index from the
+  // end (e.g., -1 for the last axis).  Other axes must have the
+  // same dimension for all the bottom blobs.
+  // By default, ConcatLayer concatenates blobs along the "channels" axis (1).
+  optional int32 axis = 2 [ default = 1 ];
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 concat_dim = 1 [ default = 1 ];
+}
+
+message BatchNormParameter {
+  // If false, normalization is performed over the current mini-batch
+  // and global statistics are accumulated (but not yet used) by a moving
+  // average.
+  // If true, those accumulated mean and variance values are used for the
+  // normalization.
+  // By default, it is set to false when the network is in the training
+  // phase and true when the network is in the testing phase.
+  optional bool use_global_stats = 1;
+  // What fraction of the moving average remains each iteration?
+  // Smaller values make the moving average decay faster, giving more
+  // weight to the recent values.
+  // Each iteration updates the moving average @f$S_{t-1}@f$ with the
+  // current mean @f$ Y_t @f$ by
+  // @f$ S_t = (1-\beta)Y_t + \beta \cdot S_{t-1} @f$, where @f$ \beta @f$
+  // is the moving_average_fraction parameter.
+  optional float moving_average_fraction = 2 [ default = .999 ];
+  // Small value to add to the variance estimate so that we don't divide by
+  // zero.
+  optional float eps = 3 [ default = 1e-5 ];
+}
+
+message BiasParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar bias.
+  optional int32 axis = 1 [ default = 1 ];
+
+  // (num_axes is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes of the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the bias
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to add a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [ default = 1 ];
+
+  // (filler is ignored unless just one bottom is given and the bias is
+  // a learned parameter of the layer.)
+  // The initialization for the learned bias parameter.
+  // Default is the zero (0) initialization, resulting in the BiasLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+}
+
+message ContrastiveLossParameter {
+  // margin for dissimilar pair
+  optional float margin = 1 [ default = 1.0 ];
+  // The first implementation of this cost did not exactly match the cost of
+  // Hadsell et al 2006 -- using (margin - d^2) instead of (margin - d)^2.
+  // legacy_version = false (the default) uses (margin - d)^2 as proposed in the
+  // Hadsell paper. New models should probably use this version.
+  // legacy_version = true uses (margin - d^2). This is kept to support /
+  // reproduce existing models and results
+  optional bool legacy_version = 2 [ default = false ];
+}
+
+message ConvolutionParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [ default = true ]; // whether to have bias terms
+
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in all spatial dimensions, or once per spatial dimension.
+  repeated uint32 pad = 3;         // The padding size; defaults to 0
+  repeated uint32 kernel_size = 4; // The kernel size
+  repeated uint32 stride = 6;      // The stride; defaults to 1
+  // Factor used to dilate the kernel, (implicitly) zero-filling the resulting
+  // holes. (Kernel dilation is sometimes referred to by its use in the
+  // algorithme à trous from Holschneider et al. 1987.)
+  repeated uint32 dilation = 18; // The dilation; defaults to 1
+
+  // For 2D convolution only, the *_h and *_w versions may also be used to
+  // specify both spatial dimensions.
+  optional uint32 pad_h = 9 [ default = 0 ];  // The padding height (2D only)
+  optional uint32 pad_w = 10 [ default = 0 ]; // The padding width (2D only)
+  optional uint32 kernel_h = 11;              // The kernel height (2D only)
+  optional uint32 kernel_w = 12;              // The kernel width (2D only)
+  optional uint32 stride_h = 13;              // The stride height (2D only)
+  optional uint32 stride_w = 14;              // The stride width (2D only)
+
+  optional uint32 group = 5 [ default = 1 ]; // The group size for group conv
+
+  optional FillerParameter weight_filler = 7; // The filler for the weight
+  optional FillerParameter bias_filler = 8;   // The filler for the bias
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 15 [ default = DEFAULT ];
+
+  // The axis to interpret as "channels" when performing convolution.
+  // Preceding dimensions are treated as independent inputs;
+  // succeeding dimensions are treated as "spatial".
+  // With (N, C, H, W) inputs, and axis == 1 (the default), we perform
+  // N independent 2D convolutions, sliding C-channel (or (C/g)-channels, for
+  // groups g>1) filters across the spatial axes (H, W) of the input.
+  // With (N, C, D, H, W) inputs, and axis == 1, we perform
+  // N independent 3D convolutions, sliding (C/g)-channels
+  // filters across the spatial axes (D, H, W) of the input.
+  optional int32 axis = 16 [ default = 1 ];
+
+  // Whether to force use of the general ND convolution, even if a specific
+  // implementation for blobs of the appropriate number of spatial dimensions
+  // is available. (Currently, there is only a 2D-specific convolution
+  // implementation; for input blobs with num_axes != 2, this option is
+  // ignored and the ND implementation will be used.)
+  optional bool force_nd_im2col = 17 [ default = false ];
+}
+
+message CropParameter {
+  // To crop, elements of the first bottom are selected to fit the dimensions
+  // of the second, reference bottom. The crop is configured by
+  // - the crop `axis` to pick the dimensions for cropping
+  // - the crop `offset` to set the shift for all/each dimension
+  // to align the cropped bottom with the reference bottom.
+  // All dimensions up to but excluding `axis` are preserved, while
+  // the dimensions including and trailing `axis` are cropped.
+  // If only one `offset` is set, then all dimensions are offset by this amount.
+  // Otherwise, the number of offsets must equal the number of cropped axes to
+  // shift the crop in each dimension accordingly.
+  // Note: standard dimensions are N,C,H,W so the default is a spatial crop,
+  // and `axis` may be negative to index from the end (e.g., -1 for the last
+  // axis).
+  optional int32 axis = 1 [ default = 2 ];
+  repeated uint32 offset = 2;
+}
+
+message DataParameter {
+  enum DB {
+    LEVELDB = 0;
+    LMDB = 1;
+  }
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // The rand_skip variable lets the data layer skip a few data points so that
+  // asynchronous SGD clients do not all start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  // DEPRECATED. Each solver accesses a different subset of the database.
+  optional uint32 rand_skip = 7 [ default = 0 ];
+  optional DB backend = 8 [ default = LEVELDB ];
+  // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [ default = 1 ];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationParameter.
+  // Specify if we would like to randomly crop an image.
+  // Defaults to 0.
+  optional uint32 crop_size = 5 [ default = 0 ];
+  // DEPRECATED. See TransformationParameter.
+  // Specify if we want to randomly mirror data.
+  // Defaults to false.
+  optional bool mirror = 6 [ default = false ];
+  // Force the encoded image to have 3 color channels
+  optional bool force_encoded_color = 9 [ default = false ];
+  // Prefetch queue (Increase if data feeding bandwidth varies, within the
+  // limit of device memory for GPU training)
+  optional uint32 prefetch = 10 [ default = 4 ];
+}
+
+message DropoutParameter {
+  optional float dropout_ratio = 1 [ default = 0.5 ]; // dropout ratio
+}
+
+// DummyDataLayer fills any number of arbitrarily shaped blobs with random
+// (or constant) data generated by "Fillers" (see "message FillerParameter").
+message DummyDataParameter {
+  // This layer produces N >= 1 top blobs.
+  // DummyDataParameter must specify 1 or N shape fields, and 0, 1 or N
+  // data_fillers.
+  //
+  // If 0 data_fillers are specified, ConstantFiller with a value of 0 is used.
+  // If 1 data_filler is specified, it is applied to all top blobs.  If N are
+  // specified, the ith is applied to the ith top blob.
+  repeated FillerParameter data_filler = 1;
+  repeated BlobShape shape = 6;
+
+  // 4D dimensions -- deprecated.  Use "shape" instead.
+  repeated uint32 num = 2;
+  repeated uint32 channels = 3;
+  repeated uint32 height = 4;
+  repeated uint32 width = 5;
+}
+
+message EltwiseParameter {
+  enum EltwiseOp {
+    PROD = 0;
+    SUM = 1;
+    MAX = 2;
+  }
+  optional EltwiseOp operation = 1 [ default = SUM ]; // element-wise operation
+  repeated float coeff = 2; // blob-wise coefficient for SUM operation
+
+  // Whether to use an asymptotically slower (for >2 inputs) but stabler method
+  // of computing the gradient for the PROD operation. (No effect for SUM op.)
+  optional bool stable_prod_grad = 3 [ default = true ];
+}
+
+// Message that stores parameters used by ELULayer
+message ELUParameter {
+  // Described in:
+  // Clevert, D.-A., Unterthiner, T., & Hochreiter, S. (2015). Fast and Accurate
+  // Deep Network Learning by Exponential Linear Units (ELUs). arXiv
+  optional float alpha = 1 [ default = 1 ];
+}
+
+// Message that stores parameters used by EmbedLayer
+message EmbedParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  // The input is given as integers to be interpreted as one-hot
+  // vector indices with dimension input_dim.  Hence input_dim should be
+  // 1 greater than the maximum possible input value.
+  optional uint32 input_dim = 2;
+
+  optional bool bias_term = 3 [ default = true ]; // Whether to use a bias term
+  optional FillerParameter weight_filler = 4;     // The filler for the weight
+  optional FillerParameter bias_filler = 5;       // The filler for the bias
+}
+
+// Message that stores parameters used by ExpLayer
+message ExpParameter {
+  // ExpLayer computes outputs y = base ^ (shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = exp(shift + scale * x).
+  optional float base = 1 [ default = -1.0 ];
+  optional float scale = 2 [ default = 1.0 ];
+  optional float shift = 3 [ default = 0.0 ];
+}
+
+// Message that stores parameters used by FlattenLayer
+message FlattenParameter {
+  // The first axis to flatten: all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 1 [ default = 1 ];
+
+  // The last axis to flatten: all following axes are retained in the output.
+  // May be negative to index from the end (e.g., the default -1 for the last
+  // axis).
+  optional int32 end_axis = 2 [ default = -1 ];
+}
+
+// Message that stores parameters used by HDF5DataLayer
+message HDF5DataParameter {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 2;
+
+  // Specify whether to shuffle the data.
+  // If shuffle == true, the ordering of the HDF5 files is shuffled,
+  // and the ordering of data within any given HDF5 file is shuffled,
+  // but data between different files are not interleaved; all of a file's
+  // data are output (in a random order) before moving onto another file.
+  optional bool shuffle = 3 [ default = false ];
+}
+
+message HDF5OutputParameter { optional string file_name = 1; }
+
+message HingeLossParameter {
+  enum Norm {
+    L1 = 1;
+    L2 = 2;
+  }
+  // Specify the Norm to use L1 or L2
+  optional Norm norm = 1 [ default = L1 ];
+}
+
+message ImageDataParameter {
+  // Specify the data source.
+  optional string source = 1;
+  // Specify the batch size.
+  optional uint32 batch_size = 4 [ default = 1 ];
+  // The rand_skip variable lets the data layer skip a few data points so that
+  // asynchronous SGD clients do not all start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 7 [ default = 0 ];
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  optional bool shuffle = 8 [ default = false ];
+  // It will also resize images if new_height or new_width are not zero.
+  optional uint32 new_height = 9 [ default = 0 ];
+  optional uint32 new_width = 10 [ default = 0 ];
+  // Specify if the images are color or gray
+  optional bool is_color = 11 [ default = true ];
+  // DEPRECATED. See TransformationParameter. For data pre-processing, we can do
+  // simple scaling and subtracting the data mean, if provided. Note that the
+  // mean subtraction is always carried out before scaling.
+  optional float scale = 2 [ default = 1 ];
+  optional string mean_file = 3;
+  // DEPRECATED. See TransformationParameter.
+  // Specify if we would like to randomly crop an image.
+  // Defaults to 0.
+  optional uint32 crop_size = 5 [ default = 0 ];
+  // DEPRECATED. See TransformationParameter.
+  // Specify if we want to randomly mirror data.
+  // Defaults to false.
+  optional bool mirror = 6 [ default = false ];
+  optional string root_folder = 12 [ default = "" ];
+}
+
+message InfogainLossParameter {
+  // Specify the infogain matrix source.
+  optional string source = 1;
+  optional int32 axis = 2 [ default = 1 ]; // axis of prob
+}
+
+message InnerProductParameter {
+  optional uint32 num_output = 1; // The number of outputs for the layer
+  optional bool bias_term = 2 [ default = true ]; // whether to have bias terms
+  optional FillerParameter weight_filler = 3;     // The filler for the weight
+  optional FillerParameter bias_filler = 4;       // The filler for the bias
+
+  // The first axis to be lumped into a single inner product computation;
+  // all preceding axes are retained in the output.
+  // May be negative to index from the end (e.g., -1 for the last axis).
+  optional int32 axis = 5 [ default = 1 ];
+  // Specify whether to transpose the weight matrix or not.
+  // If transpose == true, any operations will be performed on the transpose
+  // of the weight matrix. The weight matrix itself is not going to be
+  // transposed; rather, the transfer flag of operations will be toggled
+  // accordingly.
+  optional bool transpose = 6 [ default = false ];
+}
+
+message InputParameter {
+  // This layer produces N >= 1 top blob(s) to be assigned manually.
+  // Define N shapes to set a shape for each top.
+  // Define 1 shape to set the same shape for every top.
+  // Define no shape to defer to reshaping manually.
+  repeated BlobShape shape = 1;
+}
+
+// Message that stores parameters used by LogLayer
+message LogParameter {
+  // LogLayer computes outputs y = log_base(shift + scale * x), for base > 0.
+  // Or if base is set to the default (-1), base is set to e,
+  // so y = ln(shift + scale * x) = log_e(shift + scale * x)
+  optional float base = 1 [ default = -1.0 ];
+  optional float scale = 2 [ default = 1.0 ];
+  optional float shift = 3 [ default = 0.0 ];
+}
+
+// Message that stores parameters used by LRNLayer
+message LRNParameter {
+  optional uint32 local_size = 1 [ default = 5 ];
+  optional float alpha = 2 [ default = 1. ];
+  optional float beta = 3 [ default = 0.75 ];
+  enum NormRegion {
+    ACROSS_CHANNELS = 0;
+    WITHIN_CHANNEL = 1;
+  }
+  optional NormRegion norm_region = 4 [ default = ACROSS_CHANNELS ];
+  optional float k = 5 [ default = 1. ];
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [ default = DEFAULT ];
+}
+
+message MemoryDataParameter {
+  optional uint32 batch_size = 1;
+  optional uint32 channels = 2;
+  optional uint32 height = 3;
+  optional uint32 width = 4;
+}
+
+message MVNParameter {
+  // This parameter can be set to false to normalize mean only
+  optional bool normalize_variance = 1 [ default = true ];
+
+  // This parameter can be set to true to perform DNN-like MVN
+  optional bool across_channels = 2 [ default = false ];
+
+  // Epsilon for not dividing by zero while normalizing variance
+  optional float eps = 3 [ default = 1e-9 ];
+}
+
+message ParameterParameter { optional BlobShape shape = 1; }
+
+message PoolingParameter {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 1 [ default = MAX ]; // The pooling method
+  // Pad, kernel size, and stride are all given as a single value for equal
+  // dimensions in height and width or as Y, X pairs.
+  optional uint32 pad = 4 [ default = 0 ];   // The padding size (equal in Y, X)
+  optional uint32 pad_h = 9 [ default = 0 ]; // The padding height
+  optional uint32 pad_w = 10 [ default = 0 ]; // The padding width
+  optional uint32 kernel_size = 2;            // The kernel size (square)
+  optional uint32 kernel_h = 5;               // The kernel height
+  optional uint32 kernel_w = 6;               // The kernel width
+  optional uint32 stride = 3 [ default = 1 ]; // The stride (equal in Y, X)
+  optional uint32 stride_h = 7;               // The stride height
+  optional uint32 stride_w = 8;               // The stride width
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 11 [ default = DEFAULT ];
+  // If global_pooling then it will pool over the size of the bottom by doing
+  // kernel_h = bottom->height and kernel_w = bottom->width
+  optional bool global_pooling = 12 [ default = false ];
+}
+
+message PowerParameter {
+  // PowerLayer computes outputs y = (shift + scale * x) ^ power.
+  optional float power = 1 [ default = 1.0 ];
+  optional float scale = 2 [ default = 1.0 ];
+  optional float shift = 3 [ default = 0.0 ];
+}
+
+message PythonParameter {
+  optional string module = 1;
+  optional string layer = 2;
+  // This value is set to the attribute `param_str` of the `PythonLayer` object
+  // in Python before calling the `setup()` method. This could be a number,
+  // string, dictionary in Python dict format, JSON, etc. You may parse this
+  // string in `setup` method and use it in `forward` and `backward`.
+  optional string param_str = 3 [ default = ''];
+  // DEPRECATED
+  optional bool share_in_parallel = 4 [ default = false ];
+}
+
+// Message that stores parameters used by RecurrentLayer
+message RecurrentParameter {
+  // The dimension of the output (and usually hidden state) representation --
+  // must be explicitly set to non-zero.
+  optional uint32 num_output = 1 [ default = 0 ];
+
+  optional FillerParameter weight_filler = 2; // The filler for the weight
+  optional FillerParameter bias_filler = 3;   // The filler for the bias
+
+  // Whether to enable displaying debug_info in the unrolled recurrent net.
+  optional bool debug_info = 4 [ default = false ];
+
+  // Whether to add as additional inputs (bottoms) the initial hidden state
+  // blobs, and add as additional outputs (tops) the final timestep hidden state
+  // blobs.  The number of additional bottom/top blobs required depends on the
+  // recurrent architecture -- e.g., 1 for RNNs, 2 for LSTMs.
+  optional bool expose_hidden = 5 [ default = false ];
+}
+
+// Message that stores parameters used by ReductionLayer
+message ReductionParameter {
+  enum ReductionOp {
+    SUM = 1;
+    ASUM = 2;
+    SUMSQ = 3;
+    MEAN = 4;
+  }
+
+  optional ReductionOp operation = 1 [ default = SUM ]; // reduction operation
+
+  // The first axis to reduce to a scalar -- may be negative to index from the
+  // end (e.g., -1 for the last axis).
+  // (Currently, only reduction along ALL "tail" axes is supported; reduction
+  // of axis M through N, where N < num_axes - 1, is unsupported.)
+  // Suppose we have an n-axis bottom Blob with shape:
+  //     (d0, d1, d2, ..., d(m-1), dm, d(m+1), ..., d(n-1)).
+  // If axis == m, the output Blob will have shape
+  //     (d0, d1, d2, ..., d(m-1)),
+  // and the ReductionOp operation is performed (d0 * d1 * d2 * ... * d(m-1))
+  // times, each including (dm * d(m+1) * ... * d(n-1)) individual data.
+  // If axis == 0 (the default), the output Blob always has the empty shape
+  // (count 1), performing reduction across the entire input --
+  // often useful for creating new loss functions.
+  optional int32 axis = 2 [ default = 0 ];
+
+  optional float coeff = 3 [ default = 1.0 ]; // coefficient for output
+}
+
+// Message that stores parameters used by ReLULayer
+message ReLUParameter {
+  // Allow non-zero slope for negative inputs to speed up optimization
+  // Described in:
+  // Maas, A. L., Hannun, A. Y., & Ng, A. Y. (2013). Rectifier nonlinearities
+  // improve neural network acoustic models. In ICML Workshop on Deep Learning
+  // for Audio, Speech, and Language Processing.
+  optional float negative_slope = 1 [ default = 0 ];
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 2 [ default = DEFAULT ];
+}
+
+message ReshapeParameter {
+  // Specify the output dimensions. If some of the dimensions are set to 0,
+  // the corresponding dimension from the bottom layer is used (unchanged).
+  // Exactly one dimension may be set to -1, in which case its value is
+  // inferred from the count of the bottom blob and the remaining dimensions.
+  // For example, suppose we want to reshape a 2D blob "input" with shape 2 x 8:
+  //
+  //   layer {
+  //     type: "Reshape" bottom: "input" top: "output"
+  //     reshape_param { ... }
+  //   }
+  //
+  // If "input" is 2D with shape 2 x 8, then the following reshape_param
+  // specifications are all equivalent, producing a 3D blob "output" with shape
+  // 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim:  2  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim:  4 } }
+  //   reshape_param { shape { dim:  0  dim: 2  dim: -1 } }
+  //   reshape_param { shape { dim:  0  dim:-1  dim:  4 } }
+  //
+  optional BlobShape shape = 1;
+
+  // axis and num_axes control the portion of the bottom blob's shape that are
+  // replaced by (included in) the reshape. By default (axis == 0 and
+  // num_axes == -1), the entire bottom blob shape is included in the reshape,
+  // and hence the shape field must specify the entire output shape.
+  //
+  // axis may be non-zero to retain some portion of the beginning of the input
+  // shape (and may be negative to index from the end; e.g., -1 to begin the
+  // reshape after the last axis, including nothing in the reshape,
+  // -2 to include only the last axis, etc.).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are all equivalent,
+  // producing a blob "output" with shape 2 x 2 x 4:
+  //
+  //   reshape_param { shape { dim: 2  dim: 2  dim: 4 } }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis:  1 }
+  //   reshape_param { shape { dim: 2  dim: 4 } axis: -3 }
+  //
+  // num_axes specifies the extent of the reshape.
+  // If num_axes >= 0 (and axis >= 0), the reshape will be performed only on
+  // input axes in the half-open range [axis, axis+num_axes).
+  // num_axes may also be -1, the default, to include all remaining axes
+  // (starting from axis).
+  //
+  // For example, suppose "input" is a 2D blob with shape 2 x 8.
+  // Then the following ReshapeLayer specifications are equivalent,
+  // producing a blob "output" with shape 1 x 2 x 8.
+  //
+  //   reshape_param { shape { dim:  1  dim: 2  dim:  8 } }
+  //   reshape_param { shape { dim:  1  dim: 2  }  num_axes: 1 }
+  //   reshape_param { shape { dim:  1  }  num_axes: 0 }
+  //
+  // On the other hand, these would produce output blob shape 2 x 1 x 8:
+  //
+  //   reshape_param { shape { dim: 2  dim: 1  dim: 8  }  }
+  //   reshape_param { shape { dim: 1 }  axis: 1  num_axes: 0 }
+  //
+  optional int32 axis = 2 [ default = 0 ];
+  optional int32 num_axes = 3 [ default = -1 ];
+}
+
+message ScaleParameter {
+  // The first axis of bottom[0] (the first input Blob) along which to apply
+  // bottom[1] (the second input Blob).  May be negative to index from the end
+  // (e.g., -1 for the last axis).
+  //
+  // For example, if bottom[0] is 4D with shape 100x3x40x60, the output
+  // top[0] will have the same shape, and bottom[1] may have any of the
+  // following shapes (for the given value of axis):
+  //    (axis == 0 == -4) 100; 100x3; 100x3x40; 100x3x40x60
+  //    (axis == 1 == -3)          3;     3x40;     3x40x60
+  //    (axis == 2 == -2)                   40;       40x60
+  //    (axis == 3 == -1)                                60
+  // Furthermore, bottom[1] may have the empty shape (regardless of the value of
+  // "axis") -- a scalar multiplier.
+  optional int32 axis = 1 [ default = 1 ];
+
+  // (num_axes is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.  Otherwise, num_axes is determined by the
+  // number of axes by the second bottom.)
+  // The number of axes of the input (bottom[0]) covered by the scale
+  // parameter, or -1 to cover all axes of bottom[0] starting from `axis`.
+  // Set num_axes := 0, to multiply with a zero-axis Blob: a scalar.
+  optional int32 num_axes = 2 [ default = 1 ];
+
+  // (filler is ignored unless just one bottom is given and the scale is
+  // a learned parameter of the layer.)
+  // The initialization for the learned scale parameter.
+  // Default is the unit (1) initialization, resulting in the ScaleLayer
+  // initially performing the identity operation.
+  optional FillerParameter filler = 3;
+
+  // Whether to also learn a bias (equivalent to a ScaleLayer+BiasLayer, but
+  // may be more efficient).  Initialized with bias_filler (defaults to 0).
+  optional bool bias_term = 4 [ default = false ];
+  optional FillerParameter bias_filler = 5;
+}
+
+message SigmoidParameter {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [ default = DEFAULT ];
+}
+
+message SliceParameter {
+  // The axis along which to slice -- may be negative to index from the end
+  // (e.g., -1 for the last axis).
+  // By default, SliceLayer slices blobs along the "channels" axis (1).
+  optional int32 axis = 3 [ default = 1 ];
+  repeated uint32 slice_point = 2;
+
+  // DEPRECATED: alias for "axis" -- does not support negative indexing.
+  optional uint32 slice_dim = 1 [ default = 1 ];
+}
+
+// Message that stores parameters used by SoftmaxLayer, SoftmaxWithLossLayer
+message SoftmaxParameter {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [ default = DEFAULT ];
+
+  // The axis along which to perform the softmax -- may be negative to index
+  // from the end (e.g., -1 for the last axis).
+  // Any other axes will be evaluated as independent softmaxes.
+  optional int32 axis = 2 [ default = 1 ];
+}
+
+message TanHParameter {
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 1 [ default = DEFAULT ];
+}
+
+// Message that stores parameters used by TileLayer
+message TileParameter {
+  // The index of the axis to tile.
+  optional int32 axis = 1 [ default = 1 ];
+
+  // The number of copies (tiles) of the blob to output.
+  optional int32 tiles = 2;
+}
+
+// Message that stores parameters used by ThresholdLayer
+message ThresholdParameter {
+  optional float threshold = 1 [ default = 0 ]; // Strictly positive values
+}
+
+message WindowDataParameter {
+  // Specify the data source.
+  optional string source = 1;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 2 [ default = 1 ];
+  optional string mean_file = 3;
+  // Specify the batch size.
+  optional uint32 batch_size = 4;
+  // Specify if we would like to randomly crop an image.
+  optional uint32 crop_size = 5 [ default = 0 ];
+  // Specify if we want to randomly mirror data.
+  optional bool mirror = 6 [ default = false ];
+  // Foreground (object) overlap threshold
+  optional float fg_threshold = 7 [ default = 0.5 ];
+  // Background (non-object) overlap threshold
+  optional float bg_threshold = 8 [ default = 0.5 ];
+  // Fraction of batch that should be foreground objects
+  optional float fg_fraction = 9 [ default = 0.25 ];
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 context_pad = 10 [ default = 0 ];
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string crop_mode = 11 [ default = "warp" ];
+  // cache_images: will load all images in memory for faster access
+  optional bool cache_images = 12 [ default = false ];
+  // append root_folder to locate images
+  optional string root_folder = 13 [ default = "" ];
+}
+
+message SPPParameter {
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional uint32 pyramid_height = 1;
+  optional PoolMethod pool = 2 [ default = MAX ]; // The pooling method
+  enum Engine {
+    DEFAULT = 0;
+    CAFFE = 1;
+    CUDNN = 2;
+  }
+  optional Engine engine = 6 [ default = DEFAULT ];
+}
+
+// DEPRECATED: use LayerParameter.
+message V1LayerParameter {
+  repeated string bottom = 2;
+  repeated string top = 3;
+  optional string name = 4;
+  repeated NetStateRule include = 32;
+  repeated NetStateRule exclude = 33;
+  enum LayerType {
+    NONE = 0;
+    ABSVAL = 35;
+    ACCURACY = 1;
+    ARGMAX = 30;
+    BNLL = 2;
+    CONCAT = 3;
+    CONTRASTIVE_LOSS = 37;
+    CONVOLUTION = 4;
+    DATA = 5;
+    DECONVOLUTION = 39;
+    DROPOUT = 6;
+    DUMMY_DATA = 32;
+    EUCLIDEAN_LOSS = 7;
+    ELTWISE = 25;
+    EXP = 38;
+    FLATTEN = 8;
+    HDF5_DATA = 9;
+    HDF5_OUTPUT = 10;
+    HINGE_LOSS = 28;
+    IM2COL = 11;
+    IMAGE_DATA = 12;
+    INFOGAIN_LOSS = 13;
+    INNER_PRODUCT = 14;
+    LRN = 15;
+    MEMORY_DATA = 29;
+    MULTINOMIAL_LOGISTIC_LOSS = 16;
+    MVN = 34;
+    POOLING = 17;
+    POWER = 26;
+    RELU = 18;
+    SIGMOID = 19;
+    SIGMOID_CROSS_ENTROPY_LOSS = 27;
+    SILENCE = 36;
+    SOFTMAX = 20;
+    SOFTMAX_LOSS = 21;
+    SPLIT = 22;
+    SLICE = 33;
+    TANH = 23;
+    WINDOW_DATA = 24;
+    THRESHOLD = 31;
+  }
+  optional LayerType type = 5;
+  repeated BlobProto blobs = 6;
+  repeated string param = 1001;
+  repeated DimCheckMode blob_share_mode = 1002;
+  enum DimCheckMode {
+    STRICT = 0;
+    PERMISSIVE = 1;
+  }
+  repeated float blobs_lr = 7;
+  repeated float weight_decay = 8;
+  repeated float loss_weight = 35;
+  optional AccuracyParameter accuracy_param = 27;
+  optional ArgMaxParameter argmax_param = 23;
+  optional ConcatParameter concat_param = 9;
+  optional ContrastiveLossParameter contrastive_loss_param = 40;
+  optional ConvolutionParameter convolution_param = 10;
+  optional DataParameter data_param = 11;
+  optional DropoutParameter dropout_param = 12;
+  optional DummyDataParameter dummy_data_param = 26;
+  optional EltwiseParameter eltwise_param = 24;
+  optional ExpParameter exp_param = 41;
+  optional HDF5DataParameter hdf5_data_param = 13;
+  optional HDF5OutputParameter hdf5_output_param = 14;
+  optional HingeLossParameter hinge_loss_param = 29;
+  optional ImageDataParameter image_data_param = 15;
+  optional InfogainLossParameter infogain_loss_param = 16;
+  optional InnerProductParameter inner_product_param = 17;
+  optional LRNParameter lrn_param = 18;
+  optional MemoryDataParameter memory_data_param = 22;
+  optional MVNParameter mvn_param = 34;
+  optional PoolingParameter pooling_param = 19;
+  optional PowerParameter power_param = 21;
+  optional ReLUParameter relu_param = 30;
+  optional SigmoidParameter sigmoid_param = 38;
+  optional SoftmaxParameter softmax_param = 39;
+  optional SliceParameter slice_param = 31;
+  optional TanHParameter tanh_param = 37;
+  optional ThresholdParameter threshold_param = 25;
+  optional WindowDataParameter window_data_param = 20;
+  optional TransformationParameter transform_param = 36;
+  optional LossParameter loss_param = 42;
+  optional V0LayerParameter layer = 1;
+}
+
+// DEPRECATED: V0LayerParameter is the old way of specifying layer parameters
+// in Caffe.  We keep this message type around for legacy support.
+message V0LayerParameter {
+  optional string name = 1; // the layer name
+  optional string type = 2; // the string to specify the layer type
+
+  // Parameters to specify layers with inner products.
+  optional uint32 num_output = 3; // The number of outputs for the layer
+  optional bool biasterm = 4 [ default = true ]; // whether to have bias terms
+  optional FillerParameter weight_filler = 5;    // The filler for the weight
+  optional FillerParameter bias_filler = 6;      // The filler for the bias
+
+  optional uint32 pad = 7 [ default = 0 ];     // The padding size
+  optional uint32 kernelsize = 8;              // The kernel size
+  optional uint32 group = 9 [ default = 1 ];   // The group size for group conv
+  optional uint32 stride = 10 [ default = 1 ]; // The stride
+  enum PoolMethod {
+    MAX = 0;
+    AVE = 1;
+    STOCHASTIC = 2;
+  }
+  optional PoolMethod pool = 11 [ default = MAX ];     // The pooling method
+  optional float dropout_ratio = 12 [ default = 0.5 ]; // dropout ratio
+
+  optional uint32 local_size = 13 [ default = 5 ]; // for local response norm
+  optional float alpha = 14 [ default = 1. ];      // for local response norm
+  optional float beta = 15 [ default = 0.75 ];     // for local response norm
+  optional float k = 22 [ default = 1. ];
+
+  // For data layers, specify the data source
+  optional string source = 16;
+  // For data pre-processing, we can do simple scaling and subtracting the
+  // data mean, if provided. Note that the mean subtraction is always carried
+  // out before scaling.
+  optional float scale = 17 [ default = 1 ];
+  optional string meanfile = 18;
+  // For data layers, specify the batch size.
+  optional uint32 batchsize = 19;
+  // For data layers, specify if we would like to randomly crop an image.
+  optional uint32 cropsize = 20 [ default = 0 ];
+  // For data layers, specify if we want to randomly mirror data.
+  optional bool mirror = 21 [ default = false ];
+
+  // The blobs containing the numeric parameters of the layer
+  repeated BlobProto blobs = 50;
+  // The ratio that is multiplied on the global learning rate. If you want to
+  // set the learning ratio for one blob, you need to set it for all blobs.
+  repeated float blobs_lr = 51;
+  // The weight decay that is multiplied on the global weight decay.
+  repeated float weight_decay = 52;
+
+  // The rand_skip variable is for the data layer to skip a few data points
+  // to avoid all asynchronous sgd clients to start at the same point. The skip
+  // point would be set as rand_skip * rand(0,1). Note that rand_skip should not
+  // be larger than the number of keys in the database.
+  optional uint32 rand_skip = 53 [ default = 0 ];
+
+  // Fields related to detection (det_*)
+  // foreground (object) overlap threshold
+  optional float det_fg_threshold = 54 [ default = 0.5 ];
+  // background (non-object) overlap threshold
+  optional float det_bg_threshold = 55 [ default = 0.5 ];
+  // Fraction of batch that should be foreground objects
+  optional float det_fg_fraction = 56 [ default = 0.25 ];
+
+  // optional bool OBSOLETE_can_clobber = 57 [default = true];
+
+  // Amount of contextual padding to add around a window
+  // (used only by the window_data_layer)
+  optional uint32 det_context_pad = 58 [ default = 0 ];
+
+  // Mode for cropping out a detection window
+  // warp: cropped window is warped to a fixed size and aspect ratio
+  // square: the tightest square around the window is cropped
+  optional string det_crop_mode = 59 [ default = "warp" ];
+
+  // For ReshapeLayer, one needs to specify the new dimensions.
+  optional int32 new_num = 60 [ default = 0 ];
+  optional int32 new_channels = 61 [ default = 0 ];
+  optional int32 new_height = 62 [ default = 0 ];
+  optional int32 new_width = 63 [ default = 0 ];
+
+  // Whether or not ImageLayer should shuffle the list of files at every epoch.
+  // It will also resize images if new_height or new_width are not zero.
+  optional bool shuffle_images = 64 [ default = false ];
+
+  // For ConcatLayer, one needs to specify the dimension for concatenation, and
+  // the other dimensions must be the same for all the bottom blobs.
+  // By default it will concatenate blobs along the channels dimension.
+  optional uint32 concat_dim = 65 [ default = 1 ];
+
+  optional HDF5OutputParameter hdf5_output_param = 1001;
+}
+
+message PReLUParameter {
+  // Parametric ReLU described in K. He et al, Delving Deep into Rectifiers:
+  // Surpassing Human-Level Performance on ImageNet Classification, 2015.
+
+  // Initial value of a_i. Default is a_i=0.25 for all i.
+  optional FillerParameter filler = 1;
+  // Whether or not slope parameters are shared across channels.
+  optional bool channel_shared = 2 [ default = false ];
+}
diff --git a/fluid/image_classification/caffe2fluid/proto/compile.sh b/fluid/image_classification/caffe2fluid/proto/compile.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f621e0066d11595bc48362ad7411eeab57f035dd
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/proto/compile.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+#function:
+#   script used to generate caffepb.py from caffe.proto using protoc
+#   (a failed protoc/mv run is reported even if a stale caffepb.py exists)
+
+PROTOC=`which protoc`
+if [[ -z $PROTOC ]];then
+    echo "not found protoc, you should first install it following this[https://github.com/google/protobuf/releases]"
+    exit 1
+fi
+
+WORK_ROOT=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")
+PY_NAME="$WORK_ROOT/caffepb.py"
+"$PROTOC" --proto_path="$WORK_ROOT" --python_out="$WORK_ROOT" "$WORK_ROOT/caffe.proto"
+ret=$?
+
+if [ $ret -eq 0 ];then
+    mv "$WORK_ROOT/caffe_pb2.py" "$PY_NAME" || ret=$?
+fi
+
+if [ $ret -eq 0 ] && [ -e "$PY_NAME" ];then
+    echo "succeed to generate [$PY_NAME]"
+    exit 0
+else
+    echo "failed to generate [$PY_NAME]"
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/README.md b/fluid/image_classification/caffe2fluid/tests/lenet/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..982edc2aa67f43f849bb2523b1a15edaa02f5d28
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/tests/lenet/README.md
@@ -0,0 +1,28 @@
+### Convert lenet model from caffe format into paddle format(fluid api)
+
+### Howto
+1, Prepare your caffepb.py
+
+2, Download a lenet caffe-model
+    lenet_iter_10000.caffemodel
+        download address: https://github.com/ethereon/caffe-tensorflow/raw/master/examples/mnist/lenet_iter_10000.caffemodel
+        md5: cbec75c1c374b6c1981c4a1eb024ae01  
+
+    lenet.prototxt
+        download address: https://raw.githubusercontent.com/BVLC/caffe/master/examples/mnist/lenet.prototxt
+        md5: 27384af843338ab90b00c8d1c81de7d5
+
+
+3, Convert this model (make sure caffepb.py is ready in ../../proto)
+    convert to npy format
+        bash ./convert.sh lenet.prototxt lenet_iter_10000.caffemodel lenet.py lenet.npy
+
+    save to fluid format(optional)
+        bash ./convert.sh lenet.prototxt lenet_iter_10000.caffemodel lenet.py lenet.npy && python ./lenet.py ./lenet.npy ./fluid.model
+
+4, Use this new model(paddle installed in this python)
+    use fluid format
+        python ./predict.py ./fluid.model
+
+    use npy format
+        python ./predict.py ./lenet.npy
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh b/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b3ec1a1dce2434a4466cf5d4609de1b4aec9d346
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/tests/lenet/convert.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#function:
+#   convert a caffe model (set $PYTHON to override the interpreter)
+#   eg:
+#       bash ./convert.sh ./model.caffe/lenet.prototxt ./model.caffe/lenet.caffemodel lenet.py lenet.npy
+
+if [[ $# -ne 4 ]];then
+    echo "usage:"
+    echo "  bash $0 [PROTOTXT] [CAFFEMODEL] [PY_NAME] [WEIGHT_NAME]"
+    echo "  eg: bash $0 lenet.prototxt lenet.caffemodel lenet.py lenet.npy"
+    exit 1
+fi
+
+WORK_ROOT=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")
+if [[ -z $PYTHON ]];then
+    PYTHON=`which python`
+fi
+
+PROTOTXT=$1
+CAFFEMODEL=$2
+PY_NAME=$3
+WEIGHT_NAME=$4
+CONVERTER_PY="$WORK_ROOT/../../convert.py"
+
+$PYTHON "$CONVERTER_PY" "$PROTOTXT" --caffemodel "$CAFFEMODEL" --code-output-path="$PY_NAME" --data-output-path="$WEIGHT_NAME"
+ret=$?
+if [[ $ret -eq 0 ]];then
+    echo "succeed to convert caffe model[$CAFFEMODEL, $PROTOTXT] to paddle model[$PY_NAME, $WEIGHT_NAME]"
+else
+    echo "failed to convert caffe model[$CAFFEMODEL, $PROTOTXT]"
+fi
+exit $ret
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy
new file mode 100644
index 0000000000000000000000000000000000000000..66f773e5ffd54c8f5151b920aecdf3dd4f8c91d2
Binary files /dev/null and b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.npy differ
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..50e6927483a61c574f1152c6dc438a6b2c8a4d90
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/tests/lenet/lenet.py
@@ -0,0 +1,297 @@
+### generated by caffe2fluid, your net is in class "LeNet" ###
+
+import math
+import os
+import numpy as np
+
+
+def import_fluid():
+    import paddle.v2.fluid as fluid
+    return fluid
+
+
+def layer(op):
+    '''Decorator for composable network layers.'''
+
+    def layer_decorated(self, *args, **kwargs):
+        # Automatically set a name if not provided.
+        name = kwargs.setdefault('name', self.get_unique_name(op.__name__))
+        # Figure out the layer inputs.
+        if len(self.terminals) == 0:
+            raise RuntimeError('No input variables found for layer %s.' % name)
+        elif len(self.terminals) == 1:
+            layer_input = self.terminals[0]
+        else:
+            layer_input = list(self.terminals)
+        # Perform the operation and get the output.
+        layer_output = op(self, layer_input, *args, **kwargs)
+        # Add to layer LUT.
+        self.layers[name] = layer_output
+        # This output is now the input for the next layer.
+        self.feed(layer_output)
+        # Return self for chained calls.
+        return self
+
+    return layer_decorated
+
+
+class Network(object):
+    def __init__(self, inputs, trainable=True):
+        # The input nodes for this network
+        self.inputs = inputs
+        # The current list of terminal nodes
+        self.terminals = []
+        # Mapping from layer names to layers
+        self.layers = dict(inputs)
+        # If true, the resulting variables are set as trainable
+        self.trainable = trainable
+        # Switch variable for dropout
+        self.paddle_env = None
+        self.setup()
+
+    def setup(self):
+        '''Construct the network. '''
+        raise NotImplementedError('Must be implemented by the subclass.')
+
+    def load(self, data_path, exe=None, place=None, ignore_missing=False):
+        '''Load network weights; returns True.
+        data_path: A fluid model directory, or a numpy-serialized weights file
+        exe, place: Executor/place to use (a CPU pair is created for npy if unset)
+        ignore_missing: If true, weights without a matching variable are skipped.
+        '''
+        fluid = import_fluid()
+        #load fluid model directly
+        if os.path.isdir(data_path):
+            assert (exe is not None), \
+                'must provide a executor to load fluid model'
+            fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path)
+            return True
+
+        #load model from a npy file
+        if exe is None or place is None:
+            if self.paddle_env is None:
+                place = fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                self.paddle_env = {'place': place, 'exe': exe}
+                exe.run(fluid.default_startup_program())
+            else:
+                place = self.paddle_env['place']
+                exe = self.paddle_env['exe']
+        data_dict = np.load(data_path).item()
+        for op_name in data_dict:
+            for param_name, data in data_dict[op_name].items():
+                try:
+                    name = '%s_%s' % (op_name, param_name)
+                    v = fluid.global_scope().find_var(name)
+                    if v is None:  # find_var is expected to give None if unknown
+                        raise ValueError('variable [%s] not found' % name)
+                    v.get_tensor().set(data, place)
+                except ValueError:
+                    if not ignore_missing:
+                        raise
+        return True
+
+    def feed(self, *args):
+        '''Set the input(s) for the next operation by replacing the terminal nodes.
+        The arguments can be either layer names or the actual layers.
+        '''
+        assert len(args) != 0
+        self.terminals = []
+        for fed_layer in args:
+            if isinstance(fed_layer, basestring):
+                try:
+                    fed_layer = self.layers[fed_layer]
+                except KeyError:
+                    raise KeyError('Unknown layer name fed: %s' % fed_layer)
+            self.terminals.append(fed_layer)
+        return self
+
+    def get_output(self):
+        '''Returns the current network output.'''
+        return self.terminals[-1]
+
+    def get_unique_name(self, prefix):
+        '''Returns an index-suffixed unique name for the given prefix.
+        This is used for auto-generating layer names based on the type-prefix.
+        '''
+        ident = sum(t.startswith(prefix) for t, _ in self.layers.items()) + 1
+        return '%s_%d' % (prefix, ident)
+
+    @layer
+    def conv(self,
+             input,
+             k_h,
+             k_w,
+             c_o,
+             s_h,
+             s_w,
+             name,
+             relu=True,
+             padding=None,
+             group=1,
+             biased=True):
+        if padding is None:
+            padding = [0, 0]
+
+        # Get the number of channels in the input
+        c_i, h_i, w_i = input.shape[1:]
+
+        # Verify that the grouping parameter is valid
+        assert c_i % group == 0
+        assert c_o % group == 0
+
+        fluid = import_fluid()
+        prefix = name + '_'
+        output = fluid.layers.conv2d(
+            input=input,
+            filter_size=[k_h, k_w],
+            num_filters=c_o,
+            stride=[s_h, s_w],
+            padding=padding,
+            groups=group,
+            param_attr=fluid.ParamAttr(name=prefix + "weights"),
+            bias_attr=fluid.ParamAttr(name=prefix + "biases"),
+            act="relu" if relu is True else None)
+        return output
+
+    @layer
+    def relu(self, input, name):
+        fluid = import_fluid()
+        output = fluid.layers.relu(x=input)
+        return output
+
+    @layer
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        if padding is None:
+            padding = [0, 0]
+
+        # Get the number of channels in the input
+        h_i, w_i = input.shape[2:]
+        fluid = import_fluid()
+        output = fluid.layers.pool2d(
+            input=input,
+            pool_size=[k_h, k_w],
+            pool_stride=[s_h, s_w],
+            pool_padding=padding,
+            pool_type='max')
+        return output
+
+    @layer
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+        if padding is None:
+            padding = [0, 0]
+
+        # Get the number of channels in the input
+        h_i, w_i = input.shape[2:]
+        fluid = import_fluid()
+        output = fluid.layers.pool2d(
+            input=input,
+            pool_size=[k_h, k_w],
+            pool_stride=[s_h, s_w],
+            pool_padding=padding,
+            pool_type='avg')
+        return output
+
+    @layer
+    def lrn(self, input, radius, alpha, beta, name, bias=1.0):
+        raise Exception('lrn() not implemented yet')
+
+    @layer
+    def concat(self, inputs, axis, name):
+        fluid = import_fluid()
+        output = fluid.layers.concat(input=inputs, axis=axis)
+        return output
+
+    @layer
+    def add(self, inputs, name):
+        fluid = import_fluid()
+        output = inputs[0]
+        for i in inputs[1:]:
+            output = fluid.layers.elementwise_add(x=output, y=i)
+        return output
+
+    @layer
+    def fc(self, input, num_out, name, relu=True, act=None):
+        fluid = import_fluid()
+
+        if act is None:
+            act = 'relu' if relu is True else None
+
+        prefix = name + '_'
+        output = fluid.layers.fc(
+            name=name,
+            input=input,
+            size=num_out,
+            act=act,
+            param_attr=fluid.ParamAttr(name=prefix + 'weights'),
+            bias_attr=fluid.ParamAttr(name=prefix + 'biases'))
+        return output
+
+    @layer
+    def softmax(self, input, name):
+        fluid = import_fluid()
+        output = fluid.layers.softmax(x=input, name=name)
+        return output
+
+    @layer
+    def batch_normalization(self, input, name, scale_offset=True, relu=False):
+        # NOTE: Currently, only inference is supported
+        fluid = import_fluid()
+        prefix = name + '_'
+        param_attr = None if scale_offset is False else fluid.ParamAttr(
+            name=prefix + 'scale')
+        bias_attr = None if scale_offset is False else fluid.ParamAttr(
+            name=prefix + 'offset')
+        mean_name = prefix + 'mean'
+        variance_name = prefix + 'variance'
+        output = fluid.layers.batch_norm(
+            name=name,
+            input=input,
+            is_test=True,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            moving_mean_name=mean_name,
+            moving_variance_name=variance_name,
+            epsilon=1e-5,
+            act='relu' if relu is True else None)
+
+        return output
+
+    @layer
+    def dropout(self, input, keep_prob, name):
+        raise Exception('dropout() not implemented yet')
+
+
+class LeNet(Network):
+    def setup(self):
+        self.feed('data')
+        self.conv(5, 5, 20, 1, 1, relu=False, name='conv1')
+        self.max_pool(2, 2, 2, 2, name='pool1')
+        self.conv(5, 5, 50, 1, 1, relu=False, name='conv2')
+        self.max_pool(2, 2, 2, 2, name='pool2')
+        self.fc(500, name='ip1')
+        self.fc(10, relu=False, name='ip2')
+        self.softmax(name='prob')
+
+    @classmethod
+    def convert(cls, npy_model, fluid_path):
+        import paddle.v2.fluid as fluid
+        data_layer = fluid.layers.data(
+            name="data", shape=[1, 28, 28], dtype="float32")
+        feed_data = {"data": data_layer}
+        net = cls(feed_data)
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        net.load(data_path=npy_model, exe=exe, place=place)
+        fluid.io.save_persistables(executor=exe, dirname=fluid_path)
+
+
+if __name__ == "__main__":
+    #usage: python xxxnet.py xxx.npy ./model
+
+    import sys
+    npy_weight = sys.argv[1]
+    fluid_model = sys.argv[2]
+    LeNet.convert(npy_weight, fluid_model)
+    exit(0)
diff --git a/fluid/image_classification/caffe2fluid/tests/lenet/predict.py b/fluid/image_classification/caffe2fluid/tests/lenet/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..7405cc6f848ea139bc4edd4c3ec0e0af773ea25a
--- /dev/null
+++ b/fluid/image_classification/caffe2fluid/tests/lenet/predict.py
@@ -0,0 +1,74 @@
+#!/bin/env python
+
+#function:
+#   demo to show how to use converted model using caffe2fluid
+#
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+from lenet import LeNet as MyNet
+
+
+def test_model(exe, test_program, fetch_list, test_reader, feeder):
+    acc_set = []
+
+    for data in test_reader():
+        acc_np, pred = exe.run(program=test_program,
+                               feed=feeder.feed(data),
+                               fetch_list=fetch_list)
+        acc_set.append(float(acc_np))
+
+    acc_val = np.array(acc_set).mean()
+    return float(acc_val)
+
+
+def main(model_path):
+    """ main
+    """
+    print('load fluid model in %s' % (model_path))
+
+    with_gpu = False
+    paddle.init(use_gpu=with_gpu)
+
+    #1, define network topology
+    images = fluid.layers.data(name='image', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    net = MyNet({'data': images})
+    prediction = net.layers['prob']
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    place = fluid.CUDAPlace(0) if with_gpu is True else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    #2, load weights
+    if model_path.find('.npy') > 0:
+        net.load(data_path=model_path, exe=exe, place=place)
+    else:
+        net.load(data_path=model_path, exe=exe)
+
+    #3, test this model
+    test_program = fluid.default_main_program().clone()
+    test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    fetch_list = [acc, prediction]
+
+    print('go to test model using test set')
+    acc_val = test_model(exe, test_program, \
+            fetch_list, test_reader, feeder)
+
+    print('test accuracy is [%.4f], expected value[0.919]' % (acc_val))
+
+
+if __name__ == "__main__":
+    import sys
+    if len(sys.argv) == 2:
+        fluid_model_path = sys.argv[1]
+    else:
+        fluid_model_path = './model.fluid'
+
+    main(fluid_model_path)
diff --git a/mt_with_external_memory/README.md b/mt_with_external_memory/README.md
index 1b478bd846ec5a5083c877f15c86057014375f8a..6643b4eb6c530c9fcaaf435ae999fc03eb628838 100644
--- a/mt_with_external_memory/README.md
+++ b/mt_with_external_memory/README.md
@@ -116,7 +116,7 @@
 算法实现于以下几个文件中：
 
 - `external_memory.py`: 主要实现简化版的 **神经图灵机** 于 `ExternalMemory` 类，对外提供初始化和读写函数。
-- `model.py`: 相关模型配置函数，包括双向 GPU 编码器（`bidirectional_gru_encoder`），带外部记忆强化的解码器（`memory_enhanced_decoder`），带外部记忆强化的序列到序列模型（`memory_enhanced_decoder`）。
+- `model.py`: 相关模型配置函数，包括双向 GRU 编码器（`bidirectional_gru_encoder`），带外部记忆强化的解码器（`memory_enhanced_decoder`），带外部记忆强化的序列到序列模型（`memory_enhanced_seq2seq`）。
 - `data_utils.py`: 相关数据处理辅助函数。
 - `train.py`: 模型训练。
 - `infer.py`: 部分示例样本的翻译（模型推断）。
@@ -170,6 +170,7 @@ class ExternalMemory(object):
                                      a learnable gate function.
         :type enable_interpolation: bool
         """
+        pass
 
     def _content_addressing(self, key_vector):
         """Get write/read head's addressing weights via content-based addressing.
@@ -194,6 +195,7 @@ class ExternalMemory(object):
         :param write_key: Key vector for write heads to generate writing
                           content and addressing signals.
         :type write_key: LayerOutput
+        """
         pass
 
     def read(self, read_key):
@@ -410,7 +412,7 @@ paddle.dataset.wmt14.test(dict_size)
 命令行输入：
 
 ```bash
-python mt_with_external_memory.py
+python train.py
 ```
 或自定义部分参数, 例如: