diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0c32e26092f6ea25771279418582a24ea449ab2
--- /dev/null
+++ b/python/paddle/distributed/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py
new file mode 100644
index 0000000000000000000000000000000000000000..03c4078775d455fdb19aaf78ace4dcb98c8dd66a
--- /dev/null
+++ b/python/paddle/distributed/launch.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
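+
+"""Start multi-process (one worker process per GPU) distributed training.
+
+Illustrative invocation (assumed here: the module is importable as
+paddle.distributed.launch; train.py and --lr are placeholders for the
+user's own entrypoint script and its arguments):
+
+    python -m paddle.distributed.launch --gpus 8 --log_dir mylog \
+        train.py --lr 0.1
+"""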
+
+from __future__ import print_function
+
+import subprocess
+import os
+import sys
+import argparse
+
+# Environment passed to every worker process. FLAGS_/NCCL_/GLOG_ variables
+# from the parent environment are merged in by start_procs() below.
+default_envs = {
+    "PADDLE_TRAINER_ENDPOINTS":
+    "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177",
+    "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+    "PATH": os.getenv("PATH", ""),
+    "LD_PRELOAD": os.getenv("LD_PRELOAD", ""),
+    "PADDLE_TRAINERS_NUM": "8",
+    "NCCL_DEBUG": "INFO",
+    "GLOG_v": "0",
+    "NCCL_SOCKET_IFNAME": "eth0",
+    "NCCL_IB_GID_INDEX": "3",
+    "NCCL_IB_RETRY_CNT": "0",
+}
+
+
+def start_procs(gpus, entrypoint, entrypoint_args, log_dir):
+    procs = []
+    log_fns = []
+    # create the log directory (portable replacement for `mkdir -p`)
+    if not os.path.exists(log_dir):
+        os.makedirs(log_dir)
+    # ======== update parent envs =======
+    for k, v in os.environ.items():
+        if k.startswith("FLAGS_") or k.startswith("NCCL_") or \
+                k.startswith("GLOG_"):
+            default_envs[k] = v
+
+    # ======== for dist training =======
+    node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+    current_ip = os.getenv("POD_IP", "127.0.0.1")
+    trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",")
+    num_nodes = len(trainer_ips)
+    # build the global endpoint list: device i on node n listens on port 617i
+    all_nodes_devices_endpoints = ""
+    for n in trainer_ips:
+        for i in range(gpus):
+            if all_nodes_devices_endpoints:
+                all_nodes_devices_endpoints += ","
+            all_nodes_devices_endpoints += "%s:617%d" % (n, i)
+    nranks = num_nodes * gpus
+    # ======== for dist training =======
+
+    for i in range(gpus):
+        curr_env = {}
+        curr_env.update(default_envs)
+        curr_env.update({
+            "FLAGS_selected_gpus": "%d" % i,
+            "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i),
+            "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i),
+            # the global rank count (nranks), not the per-node count
+            "PADDLE_TRAINERS_NUM": "%d" % nranks,
+            "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints
+        })
+
+        print("starting process ", i, entrypoint, entrypoint_args, curr_env)
+        fn = open("%s/workerlog.%d" % (log_dir, i), "w")
+        log_fns.append(fn)
+        cmd = [sys.executable, "-u", entrypoint] + entrypoint_args
+        procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env))
+
+    # wait for every worker to exit, then close its log file
+    for i in range(gpus):
+        try:
+            procs[i].communicate()
+            procs[i].terminate()
+            log_fns[i].close()
+        except Exception:
+            pass
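+
+# Illustrative sketch (hypothetical worker-side code, not part of this
+# launcher): a training script started by start_procs() would typically
+# initialize itself from the environment variables set above, e.g.:
+#
+#     trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+#     nranks = int(os.getenv("PADDLE_TRAINERS_NUM"))
+#     endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(",")
+#     current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
+#     gpu_id = int(os.getenv("FLAGS_selected_gpus"))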
train.py --lr 0.1") + parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER) + return parser.parse_args() + + +def main(): + args = parse_args() + + # launch multiple training process + start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args, + args.log_dir) + + +if __name__ == "__main__": + main() diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f480e34dcac3351ba3008ad632a29943afdb81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import paddle.fluid as fluid +import paddle.fluid.core as core + +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" +os.environ[ + 'RECORDIO_FILENAME'] = '/tmp/ir_memory_optimize_transformer.wmt16.recordio' + +from test_parallel_executor_transformer import TestTransformer +from test_parallel_executor_transformer import transformer + + +# NOTE(dzhwinter): test diferent strategy colisions. +# open the eager delete tensor strategy by default. +class TestTransformerWithIR(TestTransformer): + def test_main(self): + if core.is_compiled_with_cuda(): + # check python transpiler + self.check_network_convergence( + transformer, + use_cuda=True, + memory_opt=True, + use_ir_memory_optimize=False) + # check IR memory optimize + self.check_network_convergence( + transformer, + use_cuda=True, + memory_opt=False, + use_ir_memory_optimize=True) + + +if __name__ == '__main__': + unittest.main()