diff --git a/CMakeLists.txt b/CMakeLists.txt
index c649aafeddaf9f28c213d086236c3779d3137d92..de47086dbd6a440cd413c7843c83b1c69d9841b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_F
 option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
+option(WITH_TENSORRT    "Compile PaddlePaddle with TensorRT support."   OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -181,6 +182,11 @@ if(WITH_GPU)
     include(cuda)
 endif(WITH_GPU)
 
+# TensorRT depends on GPU.
+if (NOT WITH_GPU)
+  set(WITH_TENSORRT OFF)
+endif()
+
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/Dockerfile b/Dockerfile
index 0f13acabc3e5e1f6a46c5712ca6ad199266dd5ed..9097bb657d2366997112ec7662762a93358aa647 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 
+# Install TensorRT
+# The unnecessary files has been removed to make the library small.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index aa249159470773241e0f6da2e8e086264634dd4a..e90948782bb5e333bbdb47ef9d61c1e37e3cf9e4 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -33,7 +33,7 @@ ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.11.x"
+    GIT_TAG "v1.10.x"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 7066637a7cb27b83724cb4030c29a1019981f52b..0f9521616952a2857222feab8c38fb480761ee2d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -3,7 +3,9 @@ add_custom_target(paddle_apis ALL
 
 add_custom_target(paddle_docs ALL
                   DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn)
+                  paddle_fluid_docs paddle_fluid_docs_cn
+                  paddle_mobile_docs paddle_mobile_docs_cn)
 
 add_subdirectory(v2)
 add_subdirectory(fluid)
+add_subdirectory(mobile)
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
index 22e6fb13d7320986a60bc1ef5530187e0970c767..5c02886efd7d11e9520910526fb90ec01e123bae 100644
--- a/doc/fluid/api/layers.rst
+++ b/doc/fluid/api/layers.rst
@@ -473,6 +473,12 @@ multiplex
 ..  autofunction:: paddle.fluid.layers.multiplex
     :noindex:
 
+label_smooth
+------------
+
+..  autofunction:: paddle.fluid.layers.label_smooth
+    :noindex:
+
 ops
 ===
 
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
index b123b756e2251c38f319e1aefa2cb04fd7a36b03..ad798003f560e7fb0e6db6083fdd152fd3417584 100644
--- a/doc/fluid/dev/index_cn.rst
+++ b/doc/fluid/dev/index_cn.rst
@@ -4,6 +4,7 @@
 .. toctree::
   :maxdepth: 1
 
+  api_doc_std_cn.md
   new_op_cn.md
   new_op_kernel.md
   use_eigen_cn.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
index 98988fc22dcedecdbcd67fb3bf761377bf046337..80c899a82fa452c5cd8f38dad89c15d3041b09e3 100644
--- a/doc/fluid/dev/index_en.rst
+++ b/doc/fluid/dev/index_en.rst
@@ -4,6 +4,7 @@ Development
 .. toctree::
   :maxdepth: 1
 
+  api_doc_std_en.md
   new_op_en.md
   new_op_kernel.md
   use_eigen_en.md
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b104a6318d474d6531670b8ac3569448774850c7
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,53 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+        "${BINARY_BUILD_DIR_EN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+        html
+        ${BINARY_BUILD_DIR_EN}
+        ${SPHINX_CACHE_DIR_EN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_mobile_docs gen_proto_py paddle_python)
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+        "${BINARY_BUILD_DIR_CN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+        html
+        ${BINARY_BUILD_DIR_CN}
+        ${SPHINX_CACHE_DIR_CN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_mobile_docs_cn gen_proto_py paddle_python)
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8297316e8fbb2b8f41954030293feadbcd81295e
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+=====
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
\ No newline at end of file
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e0dd9e6068174a4b0348d503f4082bee6ff68dac..5a95cbc53625888bac539f91af391ff0babec17b 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -55,21 +55,21 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
-void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
+                                                const OpDesc &op,
                                                 const platform::Place &p,
                                                 const size_t &i) const {
   auto *op_handle = result->ops_.back().get();
-  op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
-      platform::DeviceContextPool::Instance().Get(p));
+  op_handle->dev_ctxes_[p] = platform::DeviceContextPool::Instance().Get(p);
 
-  auto var_names = op->InputArgumentNames();
+  auto var_names = op.InputArgumentNames();
 
   for (auto &each_var_name : var_names) {
     VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
     op_handle->AddInput(var);
   }
 
-  var_names = op->OutputArgumentNames();
+  var_names = op.OutputArgumentNames();
 
   for (auto &each_var_name : var_names) {
     CreateOpOutput(result, op_handle, each_var_name, p, i);
@@ -107,7 +107,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       result.ops_.emplace_back(new SendOpHandle(*op, s, p));
       // Create inputs for output on original place and no ssa output
       // is created for send op.
-      CreateOpHandleIOs(&result, op, p, 0);
+      CreateOpHandleIOs(&result, *op, p, 0);
       continue;
     }
 
@@ -117,7 +117,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
 
       result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
       auto *op_handle = result.ops_.back().get();
-      CreateOpHandleIOs(&result, op, p, i);
+      CreateOpHandleIOs(&result, *op, p, i);
 
       auto var_names = op->OutputArgumentNames();
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index de34caab1be85eecb741a5003f026eb982e178ea..f1518d75b421006db6311c3b0f602e47000ab381 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -45,8 +45,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
 
  private:
-  void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
-                         const size_t &i) const;
+  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
+                         const platform::Place &p, const size_t &i) const;
 
  private:
   std::string loss_var_name_;
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 66618a291b59996836e822587af618927a4263c7..6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -66,7 +66,7 @@ TEST(ProgramDesc, copy_ctor) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_copy = global_block->Op(i);
+    auto op_copy = global_block_copy->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_copy->Type());
     ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
@@ -131,7 +131,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_restored = global_block->Op(i);
+    auto op_restored = global_block_restored->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_restored->Type());
     ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e53bcf2384e54e21c7dd5638f3b7469a35b571bf..8494edee6c2c714c285c45bbb4fe1d8cb1a524aa 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,4 +21,7 @@ endif()
 
 if(WITH_TESTING)
   add_subdirectory(tests/book)
+  if (WITH_TENSORRT)
+    add_subdirectory(tensorrt)
+  endif()
 endif()
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e39c0daac76e0993382868289f66351da3d16f8f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -0,0 +1 @@
+nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a81a708e7a79225fd52c4b8e081afdcd8fe7e9ad
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "NvInfer.h"
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace dy = paddle::platform::dynload;
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+// The following two API are implemented in TensorRT's header file, cannot load
+// from the dynamic library. So create our own implementation and directly
+// trigger the method from the dynamic library.
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IBuilder*>(
+      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IRuntime*>(
+      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y = 2x + 3
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  // Create the engine.
+  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  ScopedWeights weights(2.);
+  ScopedWeights bias(3.);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine.
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  // Two binds, input and output
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunction) {
+  // Create the network serialized model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+
+  // Use the model to create an engine and an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 84dac2937de02b3374156ebc83e19dac9f9a3e7a..b93b925a72a55442c105e4280a3580f4ea5b93a1 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,11 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+if (WITH_TENSORRT)
+  list(APPEND CUDA_SRCS tensorrt.cc)
+endif()
+
+
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 3c1ccc7445ed27c711ab250aa223c66ae0da45dc..19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "",
 
 DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 
+DEFINE_string(
+    tensorrt_dir, "",
+    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() {
 #endif
 }
 
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 4c85093a43e0e8d75b64c5b29d1ec68db1b44909..0de3559b6088086cb52c254535b6ec42da7dd724 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -25,6 +25,7 @@ void* GetCurandDsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetLapackDsoHandle();
 void* GetNCCLDsoHandle();
+void* GetTensorRtDsoHandle();
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3c8e27944ca9b6419de87d752df3a83751039b1
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag tensorrt_dso_flag;
+void *tensorrt_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..f584a49da0fefe0b064b5fb55b01ec132225ce5e
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <NvInfer.h>
+#include <dlfcn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag tensorrt_dso_flag;
+extern void* tensorrt_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                            \
+    template <typename... Args>                                         \
+    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
+      std::call_once(tensorrt_dso_flag, []() {                          \
+        tensorrt_dso_handle =                                           \
+            paddle::platform::dynload::GetTensorRtDsoHandle();          \
+        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
+      });                                                               \
+      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
+    }                                                                   \
+  };                                                                    \
+  extern DynLoad__##__name __name
+
+#else
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
+  struct DynLoad__##__name {                       \
+    template <typename... Args>                    \
+    tensorrtResult_t operator()(Args... args) {    \
+      return __name(args...);                      \
+    }                                              \
+  };                                               \
+  extern DynLoad__##__name __name
+#endif
+
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+
+TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 673e1bcae4af6d039bc969f1de6e4bcab3748cb5..ffd183af68514dbb1a8b3de39000c9ca3f56ddc3 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -873,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
   return !((isnan)(a)) && !((isinf)(a));
 }
 
+inline std::ostream& operator<<(std::ostream& os, const float16& a) {
+  os << static_cast<float>(a);
+  return os;
+}
+
 }  // namespace platform
 }  // namespace paddle
 
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index d60aecf96c8828a5656f81fd3602cfb2e66990cf..a589e32b61a9b6a44bdc4529eee715d987d6922c 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -141,5 +141,10 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, print) {
+  float16 a = float16(1.0f);
+  std::cout << a << std::endl;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/utils/DynamicLoader.cpp b/paddle/utils/DynamicLoader.cpp
index 5604a90038b06d2c1a4d9db70e4185cddfd25d3e..9ac4a56c6e300d299467630b39a32567af72cf40 100644
--- a/paddle/utils/DynamicLoader.cpp
+++ b/paddle/utils/DynamicLoader.cpp
@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
 
+DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
 #endif
 }
+
+void GetTensorRtDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(
+      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
+#endif
+}
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 2e5ff76a06152b6a12818f06baaeaa6a69726ba8..02f519de4b3988fb6aca323aaa1751ee2c4bd738 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of tensorrt
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetTensorRtDsoHandle(void** dso_handle);
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 5c2c2dd7abebf8960d68b4c4dfd746a4e27acd03..bba8b64bd88c3edc6eda110dde38c0ced50439f6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -77,6 +77,7 @@ __all__ = [
     'lod_reset',
     'lrn',
     'pad',
+    'label_smooth',
 ]
 
 
@@ -3678,3 +3679,68 @@ def pad(x, paddings, pad_value=0., name=None):
         attrs={'paddings': paddings,
                'pad_value': float(pad_value)})
     return out
+
+
+def label_smooth(label,
+                 prior_dist=None,
+                 epsilon=0.1,
+                 dtype="float32",
+                 name=None):
+    """
+    Label smoothing is a mechanism to regularize the classifier layer and is
+    called label-smoothing regularization (LSR). 
+    
+    Label smoothing is proposed to encourage the model to be less confident,
+    since optimizing the log-likelihood of the correct label directly may
+    cause overfitting and reduce the ability of the model to adapt. Label
+    smoothing replaces the ground-truth label :math:`y` with the weighted sum
+    of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
+    i.e.
+
+    .. math::
+
+        \\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k,
+
+    where :math:`1 - \epsilon` and :math:`\epsilon` are the weights
+    respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually
+    uniform distribution is used for :math:`\mu`.
+
+    See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+    Args:
+        label(Variable): The input variable containing the label data. The
+                          label data should use one-hot representation.
+        prior_dist(Variable): The prior distribution to be used to smooth
+                              labels. If not provided, an uniform distribution
+                              is used. The shape of :attr:`prior_dist` should
+                              be :math:`(1, class\_num)`. 
+        epsilon(float): The weight used to mix up the original ground-truth
+                        distribution and the fixed distribution.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, 
+                                                  float_64, int etc.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The tensor variable containing the smoothed labels.
+
+    Examples:
+        .. code-block:: python
+
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+    """
+    if epsilon > 1. or epsilon < 0.:
+        raise ValueError("The value of epsilon must be between 0 and 1.")
+    helper = LayerHelper("label_smooth", **locals())
+    label.stop_gradient = True
+    smooth_label = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="label_smooth",
+        inputs={"X": label,
+                "PriorDist": prior_dist} if prior_dist else {"X": label},
+        outputs={"Out": smooth_label},
+        attrs={"epsilon": float(epsilon)})
+    return smooth_label
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 99a81c1d4244b919a53dfec36fc5a6659c10adae..c618b02a768f2ca3e2b2914d8ee0134836d5c0d2 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -169,7 +169,7 @@ class Accuracy(MetricBase):
         return self.value / self.weight
 
 
-class ChunkEvalutor(MetricBase):
+class ChunkEvaluator(MetricBase):
     """
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
@@ -177,7 +177,7 @@ class ChunkEvalutor(MetricBase):
     """
 
     def __init__(self, name=None):
-        super(ChunkEvalutor, self).__init__(name)
+        super(ChunkEvaluator, self).__init__(name)
         self.num_infer_chunks = 0
         self.num_label_chunks = 0
         self.num_correct_chunks = 0
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index f88a6f1ce6e953c54da29f9e96199169b2cecd8b..a1be2d671ddc5c689b16319fcf5bf12dca5dde7e 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -340,6 +340,16 @@ class TestBook(unittest.TestCase):
             print(layers.lod_reset(x=x, y=y))
         print(str(program))
 
+    def test_label_smooth(self):
+        program = Program()
+        with program_guard(program):
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+            self.assertIsNotNone(smooth_label)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
index 3b059735a924d58714cd88a761eb83143f1192d6..12efdc4a0fec83fed57bdcbf687aaec69d13ba91 100644
--- a/python/paddle/v2/reader/__init__.py
+++ b/python/paddle/v2/reader/__init__.py
@@ -50,7 +50,7 @@ An example implementation for single item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height)
-    return reader
+        return reader
 
 An example implementation for multiple item data reader creator:
 
@@ -60,7 +60,7 @@ An example implementation for multiple item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height), label
-    return reader
+        return reader
 
 
 TODO(yuyang18): Should we add whole design doc here?
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..837fcbb8512bce027ecd09a7f39b806151e9154b
--- /dev/null
+++ b/tools/aws_benchmarking/README.md
@@ -0,0 +1,160 @@
+# AWS benchmark testing tool
+This is an automation tool for deploying paddlepaddle benchmark testing to AWS.
+
+## Features
+
+ - subnet creation to fit just the amount of ec2 instances required.
+ - pserver and trainer ec2 instances allocation, and instance state verification
+ - nvidia-docker ready for GPU training
+ - Instances and network element garbage collection when a task is accomplished or an error occurred
+ - Test log is collected in realtime
+ - Web service for checking log or tearing down the testing setup
+ - No testing code change needed
+ - Lots of optional configuration options
+
+ ## Usages
+
+ ### Prerequisites
+
+ - You have a working AWS account
+ - You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
+ - Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential.
+ - You have key pair created and pem file downloaded.
+ - You have a default VPC in the region you want to run the test.
+ - You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default)
+ - If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time.
+
+ ### Start a benchmark test
+
+#### Create training image
+
+*What to expect in this step:*
+
+*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.*
+
+Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows:
+
+```Dockerfile
+FROM paddlepaddle/paddle:latest-gpu
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "my_training.py"]
+```
+
+***Please Note***
+Training nodes will run your `ENTRYPOINT` script with the following environment variables:
+
+ - `TASK_NAME`: unique name to identify this training process.
+ - `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
+ - `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436"
+ - `PSERVERS`: same as above
+ - `TRAINERS`: trainer count
+ - `SERVER_ENDPOINT`: current server end point if the node role is a pserver
+ - `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer.
+ - `PADDLE_INIT_TRAINER_ID`: same as above
+
+ Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command:
+
+ ```bash
+ docker build -t myreponname/paddle_benchmark .
+ ```
+
+ Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance.
+
+ ```bash
+ docker push myreponame/paddle_benchmark
+ ```
+
+#### Create instances and start training
+
+*What to expect in this step*
+
+*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored*
+
+Now let's start the training process:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pare name>.pem \
+putcn/paddle_aws_client \
+--action create \
+--key_name <your key pare name> \
+--security_group_id <your security group id> \
+--docker_image myreponame/paddle_benchmark \
+--pserver_count 2 \
+--trainer_count 2
+```
+
+Now just wait until you see this:
+```
+master server finished init process, visit http://XXX:XXX/status to check master log
+```
+That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished.
+
+#### Post creation operations
+
+To access the master log:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action status \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To tear down the training setup:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action cleanup \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To retrieve training logs
+TBD
+
+### Tech details
+
+*What to expect in this step*
+
+*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.*
+
+Let's understand what is happening under the hood when you run above command in your laptop
+
+![alt](diagram.png)
+
+There are 4 roles in the figure above:
+ - client: your laptop
+ - master: who tasks to aws api server to create/tear down instances, and monitor training process
+ - AWS api server: the one who actually creates and manages instances
+ - pservers and trainers: training instances
+
+When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low.
+
+Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5).
+
+Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call.
+
+if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=<yourtask name>_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts.
+When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env.
+
+Master exposes 4 major services:
+
+ - GET `/status`: return master log
+ - GET `/logs`: return list of log file names
+ - GET `/log/<logfile name>`: return a particular log by log file name
+ - POST `/cleanup`: teardown the whole setup
+
+
+### Parameters
+
+TBD, please refer to client/cluster_launcher.py for now
+
+### Trouble shooting
+
+TBD
diff --git a/tools/aws_benchmarking/client/Dockerfile b/tools/aws_benchmarking/client/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..812c5d4bce0adff404577ce6b5fd3f0f4a91118c
--- /dev/null
+++ b/tools/aws_benchmarking/client/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..594378ff8fc0744a4b11b1c11e2e3b270be7aed0
--- /dev/null
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -0,0 +1,407 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import math
+import logging
+import copy
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+from scp import SCPClient
+import requests
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-east-2")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-west-2")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+ec2client = boto3.client('ec2')
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name + "_master"
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+
+def create():
+
+    init_args()
+
+    # create subnet
+    if not args.subnet_id:
+        args.subnet_id = create_subnet()
+
+    # create master node
+
+    master_instance_response = run_instances(
+        image_id="ami-7a05351f", instance_type="t2.nano")
+
+    logging.info("master server started")
+
+    args.master_server_public_ip = master_instance_response[0][
+        "PublicIpAddress"]
+    args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
+
+    logging.info("master server started, master_ip=%s, task_name=%s" %
+                 (args.master_server_public_ip, args.task_name))
+
+    # cp config file and pems to master node
+
+    ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    ssh_client.connect(
+        hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
+
+    with SCPClient(ssh_client.get_transport()) as scp:
+        scp.put(os.path.expanduser("~") + "/" + ".aws",
+                recursive=True,
+                remote_path='/home/ubuntu/')
+        scp.put(args.pem_path,
+                remote_path='/home/ubuntu/' + args.key_name + ".pem")
+
+    logging.info("credentials and pem copied to master")
+
+    # set arguments and start docker
+    kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
+    kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
+    kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
+    kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
+        args.master_server_port)
+    kick_off_cmd += " " + args.master_docker_image
+
+    args_to_pass = copy.copy(args)
+    args_to_pass.action = "serve"
+    del args_to_pass.pem_path
+    del args_to_pass.security_group_ids
+    del args_to_pass.master_docker_image
+    del args_to_pass.master_server_public_ip
+    for arg, value in sorted(vars(args_to_pass).iteritems()):
+        if value:
+            kick_off_cmd += ' --%s %s' % (arg, value)
+
+    logging.info(kick_off_cmd)
+    stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
+    return_code = stdout.channel.recv_exit_status()
+    logging.info(return_code)
+    if return_code != 0:
+        raise Exception("Error while kicking off master")
+
+    logging.info(
+        "master server finished init process, visit %s to check master log" %
+        (get_master_web_url("/status")))
+
+
+def cleanup():
+    print requests.post(get_master_web_url("/cleanup")).text
+
+
+def status():
+    print requests.post(get_master_web_url("/status")).text
+
+
+def get_master_web_url(path):
+    return "http://" + args.master_server_public_ip + ":" + str(
+        args.master_server_port) + path
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        create()
+    elif args.action == "cleanup":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        cleanup()
+    elif args.action == "status":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        status()
diff --git a/tools/aws_benchmarking/client/requirements.txt b/tools/aws_benchmarking/client/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9454801f2025671cfd1a2c3b71cf4c2ac07cb8fb
--- /dev/null
+++ b/tools/aws_benchmarking/client/requirements.txt
@@ -0,0 +1,6 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
+scp
+requests
diff --git a/tools/aws_benchmarking/diagram.png b/tools/aws_benchmarking/diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..b97909c5fe78b59d0e636ff73c2ed3e63a0be722
Binary files /dev/null and b/tools/aws_benchmarking/diagram.png differ
diff --git a/tools/aws_benchmarking/server/Dockerfile b/tools/aws_benchmarking/server/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..333523abcdb6fbe7dc01bbaf7d32ce1d8e866028
--- /dev/null
+++ b/tools/aws_benchmarking/server/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
new file mode 100644
index 0000000000000000000000000000000000000000..21f85a5fc43e951897eb6b785367630abda722c0
--- /dev/null
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -0,0 +1,673 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import json
+import math
+import time
+import threading
+import logging
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+
+
+# You must have aws_access_key_id, aws_secret_access_key, region set in
+# ~/.aws/credentials and ~/.aws/config
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-east-2"
+)
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-west-2"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--pserver_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"),
+    help="pserver bash file path")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"),
+    help="trainer bash file path")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--action', type=str, default="serve", help="create|cleanup|serve")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_ip', type=str, default="", help="master server private ip")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+ec2client = boto3.client('ec2')
+
+args.log_path = os.path.join(os.path.dirname(__file__), "logs/")
+
+logging.basicConfig(
+    filename=args.log_path + 'master.log',
+    level=logging.INFO,
+    format='%(asctime)s %(message)s')
+
+log_files = ["master.log"]
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def script_to_str(file_path):
+    if not file_path:
+        return "echo $PSERVER_HOSTS"
+    file = open(file_path, 'r')
+    text = file.read().strip()
+    file.close()
+    return text
+
+
+def run_instances(image_id, instance_type, count, role, cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def create_pservers():
+    try:
+        return run_instances(
+            image_id=args.pserver_image_id,
+            instance_type=args.pserver_instance_type,
+            count=args.pserver_count,
+            role="PSERVER", )
+    except Exception:
+        logging.exception("error while trying to create pservers")
+        cleanup(args.task_name)
+
+
+def log_to_file(source, filename):
+    if not filename in log_files:
+        log_files.append(filename)
+    with open(args.log_path + filename, "a") as log_file:
+        for line in iter(source.readline, ""):
+            log_file.write(line)
+
+
+def create_trainers(kickoff_cmd, pserver_endpoints_str):
+    def create_and_start_trainer(trainer_index):
+        logging.info("trainer " + str(trainer_index) + " is starting")
+
+        instance_response = run_instances(
+            image_id=args.trainer_image_id,
+            instance_type=args.trainer_instance_type,
+            count=1,
+            role="TRAINER", )[0]
+        trainer_ip = instance_response["PrivateIpAddress"]
+
+        logging.info("trainer " + str(trainer_index) + " started")
+
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=trainer_ip, username="ubuntu", pkey=ssh_key)
+
+        logging.info("trainer " + str(trainer_index) +
+                     " terminal connected via ssh")
+
+        cmd = kickoff_cmd.format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            TRAINER_INDEX=str(trainer_index),
+            TASK_NAME=args.task_name,
+            TRAINER_COUNT=args.trainer_count,
+            COMMAND=args.trainer_command,
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        # read and save output log
+
+        logging.info("trainer " + str(trainer_index) +
+                     " command executed, keep fetching log")
+
+        stdout_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stdout,
+                "trainer_" + str(trainer_index) + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stderr,
+                "trainer_" + str(trainer_index) + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        if return_code != 0:
+            trainer_create_results[trainer_index] = {'has_error': True}
+            raise ValueError("trainer didn't finish with exit code 0")
+
+        ssh_client.close()
+
+    # multi thread starting trainer instance and run kickoff command
+
+    trainer_threads = []
+    trainer_create_results = {}
+    try:
+        for i in xrange(args.trainer_count):
+            logging.info("starting tread for trainer " + str(i))
+            trainer_thread = threading.Thread(
+                target=create_and_start_trainer, args=(i, ))
+            trainer_thread.start()
+            trainer_threads.append(trainer_thread)
+
+        for trainer_thread in trainer_threads:
+            trainer_thread.join()
+
+        for result in trainer_create_results:
+            if result["has_error"]:
+                logging.error(
+                    "error during trainer starting or training, destorying the while cluster "
+                )
+                cleanup(args.task_name)
+                break
+
+        logging.info("all trainers stopped")
+    except Exception, e:
+        logging.info(
+            "Training exception, clean up resources, please check log for more info"
+        )
+    finally:
+        cleanup(args.task_name)
+
+
+def cleanup(task_name):
+    if args.no_clean_up:
+        logging.info("no clean up option set, going to leave the setup running")
+        return
+    #shutdown all ec2 instances
+    print("going to clean up " + task_name + " instances")
+    instances_response = ec2client.describe_instances(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    instance_ids = []
+    if len(instances_response["Reservations"]) > 0:
+        for reservation in instances_response["Reservations"]:
+            for instance in reservation["Instances"]:
+                instance_ids.append(instance["InstanceId"])
+
+        ec2client.terminate_instances(InstanceIds=instance_ids)
+
+        instance_termination_waiter = ec2client.get_waiter(
+            'instance_terminated')
+        instance_termination_waiter.wait(InstanceIds=instance_ids)
+
+    #delete the subnet created
+
+    subnet = ec2client.describe_subnets(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    if len(subnet["Subnets"]) > 0:
+        ec2client.delete_subnet(SubnetId=subnet["Subnets"][0]["SubnetId"])
+    # no subnet delete waiter, just leave it.
+    logging.info("Clearnup done")
+    return
+
+
+def kickoff_pserver(host, pserver_endpoints_str):
+    try:
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=host, username="ubuntu", pkey=ssh_key)
+        cmd = (script_to_str(args.pserver_bash_file)).format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            PSERVER_PORT=args.pserver_port,
+            TASK_NAME=args.task_name,
+            COMMAND=args.pserver_command,
+            TRAINER_COUNT=args.trainer_count,
+            TRAINER_INDEX=0,
+            # there is no way to use 0.0.0.0:port to start pserver
+            # has to docker --network="host" with host ip to make this work
+            SERVER_ENDPOINT=host + ":" + str(args.pserver_port),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        stdout_thread = threading.Thread(
+            target=log_to_file, args=(
+                stdout,
+                "pserver_" + host + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file, args=(
+                stderr,
+                "pserver_" + host + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        logging.info(return_code)
+        if return_code != 0:
+            raise Exception("Error while kicking off pserver training process")
+    except Exception:
+        logging.exception("Error while kicking off pserver training process")
+        cleanup(args.task_name)
+    finally:
+        ssh_client.close()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+    args.trainers_job_done_count = 0
+
+
+def create_cluster():
+
+    if not args.subnet_id:
+        logging.info("creating subnet for this task")
+        args.subnet_id = create_subnet()
+        logging.info("subnet %s created" % (args.subnet_id))
+
+    logging.info("creating pservers")
+    pserver_create_response = create_pservers()
+    logging.info("pserver created, collecting pserver ips")
+
+    pserver_endpoints = []
+    for pserver in pserver_create_response:
+        pserver_endpoints.append(pserver["NetworkInterfaces"][0][
+            "PrivateIpAddress"] + ":" + args.pserver_port)
+
+    pserver_endpoints_str = ",".join(pserver_endpoints)
+
+    logging.info("kicking off pserver training process")
+    pserver_threads = []
+    for pserver in pserver_create_response:
+        pserver_thread = threading.Thread(
+            target=kickoff_pserver,
+            args=(pserver["PrivateIpAddress"], pserver_endpoints_str))
+        pserver_thread.start()
+        pserver_threads.append(pserver_thread)
+
+    logging.info("all pserver training process started")
+
+    logging.info("creating trainers and kicking off trainer training process")
+    create_trainers(
+        kickoff_cmd=script_to_str(args.trainer_bash_file),
+        pserver_endpoints_str=pserver_endpoints_str)
+
+    for pserver_thread in pserver_threads:
+        pserver_thread.join()
+
+    logging.info("all process ended")
+
+
+def start_server(args):
+    class S(BaseHTTPRequestHandler):
+        def _set_headers(self):
+            self.send_response(200)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+
+        def do_HEAD(self):
+            self._set_headers()
+
+        def do_404(self):
+            self.send_response(404)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+            logging.info("Received invalid GET request" + self.path)
+            self.wfile.write("NO ACTION FOUND")
+
+        def do_GET(self):
+
+            request_path = self.path
+            if request_path == "/status" or request_path == "/master_logs":
+                self._set_headers()
+                logging.info("Received request to return status")
+                with open(args.log_path + "master.log", "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            elif request_path == "/list_logs" or request_path == "/logs":
+                self._set_headers()
+                self.wfile.write("\n".join(log_files))
+            elif "/log/" in request_path:
+                self._set_headers()
+                log_file_path = request_path.replace("/log/", "")
+                logging.info("requesting log file path is" + args.log_path +
+                             log_file_path)
+                with open(args.log_path + log_file_path, "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            else:
+                self.do_404()
+
+        def do_POST(self):
+
+            request_path = self.path
+
+            if request_path == "/save_data":
+                self._set_headers()
+                logging.info("Received request to save data")
+                self.wfile.write("DATA SAVED!")
+                content_length = int(self.headers['Content-Length'])
+                post_data = self.rfile.read(content_length)
+                if args.task_name:
+                    with open(args.task_name + ".txt", "a") as text_file:
+                        text_file.write(post_data + "\n")
+
+            elif request_path == "/cleanup":
+                self._set_headers()
+                logging.info("Received request to cleanup cluster")
+                cleanup(args.task_name)
+                self.wfile.write("cleanup in progress")
+
+            else:
+                self.do_404()
+
+    server_address = ('', args.master_server_port)
+    httpd = HTTPServer(server_address, S)
+    logging.info("HTTP server is starting")
+    httpd.serve_forever()
+
+
+def print_arguments():
+    logging.info('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        logging.info('%s: %s' % (arg, value))
+    logging.info('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        logging.info("going to create cluster")
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        init_args()
+        create_cluster()
+    elif args.action == "cleanup":
+        logging.info("going to cleanup cluster")
+        if not args.task_name:
+            raise ValueError("task_name is required")
+        cleanup(args.task_name)
+    elif args.action == "serve":
+        # serve mode
+        if not args.master_server_ip:
+            raise ValueError(
+                "No master server ip set, please run with --action create")
+
+        logging.info("going to start serve and create cluster")
+
+        init_args()
+
+        logging.info("starting server in another thread")
+        server_thread = threading.Thread(target=start_server, args=(args, ))
+        server_thread.start()
+
+        create_cluster()
+        server_thread.join()
+    elif args.action == "test":
+        start_server(args)
diff --git a/tools/aws_benchmarking/server/logs/master.log b/tools/aws_benchmarking/server/logs/master.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..2612856d1e6273fe2642f82e8c616eb9ff24f8a4
--- /dev/null
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device CPU
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/requirements.txt b/tools/aws_benchmarking/server/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5c523854f28b0a6f024fba2b2f344b53ba967a2f
--- /dev/null
+++ b/tools/aws_benchmarking/server/requirements.txt
@@ -0,0 +1,4 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..a4b2876b08cdf05e90e50589f897d74ca5f90443
--- /dev/null
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash 
+nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND} --device GPU
\ No newline at end of file