Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into improve_pruning

15bf6e05 · zlx · 1a82e7da · 7bce40d7 · 15bf6e05 · 15bf6e05
86 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,6 +47,7 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
+option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -107,6 +108,7 @@ include(configure)          # add paddle env configuration
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
+include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
 set(EXTERNAL_LIBS
    ${GFLAGS_LIBRARIES}
@@ -126,9 +128,12 @@ endif(WITH_GPU)
 add_subdirectory(proto)
 add_subdirectory(paddle)
-add_subdirectory(go/master/c)
 add_subdirectory(python)
-add_subdirectory(go/pserver/cclient)
+if(WITH_GOLANG)  # Go components are only added to the build when WITH_GOLANG is ON
+    # TODO: add go/master/c back once its build is fixed.
+    add_subdirectory(go/pserver/cclient)
+endif(WITH_GOLANG)
 if(WITH_DOC)
    add_subdirectory(doc)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -40,6 +40,10 @@ if(NOT CMAKE_CROSSCOMPILING)
    endif()
 endif()
+if(NOT WITH_GOLANG)  # Go support is disabled: tell the compiled sources about it
+    add_definitions(-DPADDLE_WITHOUT_GOLANG)  # passes the PADDLE_WITHOUT_GOLANG define to the compiler
+endif(NOT WITH_GOLANG)
 if(NOT WITH_GPU)
    add_definitions(-DPADDLE_ONLY_CPU)
    add_definitions(-DHPPL_STUB_FUNC)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -11,11 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
-# To simplify the build process of PaddlePaddle, we defined couple of
+# generic.cmake defines CMake functions that look like Bazel's
-# fundamental abstractions, e.g., how to build library, binary and
+# building rules (https://bazel.build/).
-# test in C++, CUDA and Go.
+#
 # 
 # -------------------------------------------
 #     C++        CUDA C++       Go
@@ -25,27 +26,131 @@
 # cc_test       nv_test      go_test
 # -------------------------------------------
 # 
-# cmake_parse_arguments can help us to achieve this goal.
+# To build a static library example.a from example.cc using the system
-# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+#  compiler (like GCC):
+# 
+#   cc_library(example SRCS example.cc)
+# 
+# To build a static library example.a from multiple source files
+# example{1,2,3}.cc:
+# 
+#   cc_library(example SRCS example1.cc example2.cc example3.cc)
+# 
+# To build a shared library example.so from example.cc:
+# 
+#   cc_library(example SHARED SRCS example.cc)
+# 
+# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
+# prefixed version:
+# 
+#   nv_library(example SRCS example.cu)
+# 
+# To specify that a library new_example.a depends on other libraries:
+# 
+#   cc_library(new_example SRCS new_example.cc DEPS example)
+# 
+# Static libraries can be composed of other static libraries:
+# 
+#   cc_library(composed DEPS dependent1 dependent2 dependent3)
+# 
+# To build an executable binary file from some source files and
+# dependent libraries:
 # 
+#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
+# 
+# To build an executable binary file using NVCC, use the nv_ prefixed
+# version:
+# 
+#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
+# 
+# To build a unit test binary, which is an executable binary with
+# GoogleTest linked:
+# 
+#   cc_test(example_test SRCS example_test.cc DEPS example)
+# 
+# To build a unit test binary using NVCC, use the nv_ prefixed version:
+# 
+#   nv_test(example_test SRCS example_test.cu DEPS example)
+#
+# Executable and test binaries often depend on pre-defined external
+# libraries, such as glog and gflags, which are defined in
+# /cmake/external/*.cmake:
+#
+#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
 if(NOT APPLE)
    find_package(Threads REQUIRED)
    link_libraries(${CMAKE_THREAD_LIBS_INIT})
 endif(NOT APPLE)
-# cc_library parses tensor.cc and figures out that target also depend on tensor.h.
+# merge_static_libs(TARGET_NAME lib1 [lib2 ...])
+#
+# Defines a static library target TARGET_NAME that bundles the contents of
+# the given static library targets. Duplicate names in the list are only
+# merged once; passing a target that is not a STATIC_LIBRARY aborts
+# configuration with a fatal error.
+#
+# On APPLE the archives are combined with libtool. Elsewhere each input
+# archive is unpacked with ${CMAKE_AR} into its own "<lib>.objdir" directory,
+# its member list is written to "<lib>.objlist", and every list is then
+# appended to the merged archive after it has been built.
+function(merge_static_libs TARGET_NAME)
-# cc_library(tensor
+  set(libs ${ARGN})
-#   SRCS
+  list(REMOVE_DUPLICATES libs)
-#   tensor.cc
-#   DEPS
+  # First get the file names of the libraries to be merged
-#   variant)
+  foreach(lib ${libs})
+    get_target_property(libtype ${lib} TYPE)
+    if(NOT libtype STREQUAL "STATIC_LIBRARY")
+      message(FATAL_ERROR "merge_static_libs can only process static libraries")
+    endif()
+    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+  endforeach()
+  if(APPLE) # Use OSX's libtool to merge archives
+    add_custom_target(${TARGET_NAME}_archive
+      COMMAND libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+      DEPENDS ${libs}
+      )
+    add_library(${TARGET_NAME} STATIC IMPORTED GLOBAL)
+    set_property(TARGET ${TARGET_NAME} PROPERTY
+      IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a")
+    add_dependencies(${TARGET_NAME} ${TARGET_NAME}_archive)
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    foreach(lib ${libs})
+      set(objlistfile ${lib}.objlist) # list of objects in the input library
+      set(objdir ${lib}.objdir)
+      add_custom_command(OUTPUT ${objdir}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+      add_custom_command(OUTPUT ${objlistfile}
+        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ../${objlistfile}
+        DEPENDS ${lib} ${objdir}
+        WORKING_DIRECTORY ${objdir})
+      # Empty dummy source file that goes into merged library
+      set(mergebase ${lib}.mergebase.c)
+      add_custom_command(OUTPUT ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        DEPENDS ${objlistfile})
+      list(APPEND mergebases "${mergebase}")
+    endforeach()
+    # We need a target for the output merged library
+    add_library(${TARGET_NAME} STATIC ${mergebases})
+    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
+    foreach(lib ${libs})
+      # Derive the object list and extraction directory from the current
+      # library. The objlistfile/objdir variables set in the loop above still
+      # hold the values of the last library only, so reusing them here would
+      # add that one library's objects once per input and drop all the others.
+      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
+        WORKING_DIRECTORY ${lib}.objdir)
+    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND ${CMAKE_RANLIB} ${outlibfile})
+  endif()
+endfunction(merge_static_libs)
 function(cc_library TARGET_NAME)
-  set(options OPTIONAL)
+  set(options STATIC static SHARED shared)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${cc_library_OPTIONAL} STREQUAL "SHARED")
+  if (cc_library_SRCS)
+    if (cc_library_SHARED OR cc_library_shared) # build *.so
      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
    else()
      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
@@ -53,14 +158,17 @@ function(cc_library TARGET_NAME)
    if (cc_library_DEPS)
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()
+  else(cc_library_SRCS)
+    if (cc_library_DEPS)
+      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
+    else()
+      message(FATAL "Please specify source file or library in cc_library.")
+    endif()
+  endif(cc_library_SRCS)
 endfunction(cc_library)
-# cc_binary parses tensor.cc and figures out that target also depend on tensor.h.
-# cc_binary(tensor
-#   SRCS
-#   tensor.cc)
 function(cc_binary TARGET_NAME)
-  set(options OPTIONAL)
+  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -71,13 +179,6 @@ function(cc_binary TARGET_NAME)
  endif()
 endfunction(cc_binary)
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-# cc_test(tensor_test
-#   SRCS
-#   tensor_test.cc
-#   DEPS
-#   tensor)
 function(cc_test TARGET_NAME)
  if(WITH_TESTING)
    set(options "")
@@ -91,21 +192,14 @@ function(cc_test TARGET_NAME)
  endif()
 endfunction(cc_test)
-# Suppose that ops.cu includes global functions that take Tensor as
-# their parameters, so ops depend on tensor. This implies that if
-# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built.
-# nv_library(ops
-#   SRCS
-#   ops.cu
-#   DEPS
-#   tensor)
 function(nv_library TARGET_NAME)
  if (WITH_GPU)
-    set(options OPTIONAL)
+    set(options STATIC static SHARED shared)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if (${nv_library_OPTIONAL} STREQUAL "SHARED")
+    if(nv_library_SRCS)
+      if (nv_library_SHARED OR nv_library_shared) # build *.so
        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
      else()
          cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
@@ -113,6 +207,13 @@ function(nv_library TARGET_NAME)
      if (nv_library_DEPS)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
      endif()
+    else(nv_library_SRCS)
+      if (nv_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
+      else()
+        message(FATAL "Please specify source file or library in nv_library.")
+      endif()
+    endif(nv_library_SRCS)
  endif()
 endfunction(nv_library)
@@ -130,13 +231,6 @@ function(nv_binary TARGET_NAME)
  endif()
 endfunction(nv_binary)
-# The dependency to target tensor implies that if any of
-# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built.
-# nv_test(ops_test
-#   SRCS
-#   ops_test.cu
-#   DEPS
-#   ops)
 function(nv_test TARGET_NAME)
  if (WITH_GPU AND WITH_TESTING)
    set(options "")

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME)
        paddle_parameter
        paddle_proto
        paddle_cuda
+        paddle_optimizer
        ${EXTERNAL_LIBS}
        ${CMAKE_THREAD_LIBS_INIT}
        ${CMAKE_DL_LIBS}

--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -11,13 +11,4 @@ include(flags)
 go_library(paddle_pserver_cclient STATIC)
-if(PROJ_ROOT)
-  add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a
-          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.h ${PROJ_ROOT}/paddle/trainer/
-          COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.a ${PROJ_ROOT}/paddle/trainer/
-          WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-          DEPENDS paddle_pserver_cclient)
-  add_custom_target(paddle_pserver_cclient_lib ALL DEPENDS ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a)
-endif(PROJ_ROOT)
 add_subdirectory(test)
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
+add_subdirectory(optimizer)
 add_subdirectory(strings)
 # Do not build go directory until go cmake is working smoothly.
@@ -19,8 +20,8 @@ find_package(Boost QUIET)
 if(Boost_FOUND)
  include_directories(${Boost_INCLUDE_DIRS})
-  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+  add_subdirectory(platform)
-  add_subdirectory(majel)
+  add_subdirectory(framework)
 endif()
 if(WITH_C_API)

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
    Internal.h)
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp paddle_pserver_cclient_lib)
+add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib)
 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -842,7 +842,8 @@ public:
                                               int passCount,
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
-      OptimizationConfig* config, const std::string pserverSpec);
+      OptimizationConfig* config,
+      const std::string pserverSpec) throw(UnsupportError);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
+#ifndef PADDLE_WITHOUT_GOLANG
 #include "paddle/trainer/NewRemoteParameterUpdater.h"
+#endif
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
@@ -30,11 +32,16 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 }
 // Creates a ParameterUpdater whose implementation is a
 // paddle::NewRemoteParameterUpdater built from the given optimization config
 // and pserverSpec. When PADDLE_WITHOUT_GOLANG is defined, no updater is
 // created and UnsupportError is thrown instead.
 // The returned object is allocated with new; NOTE(review): ownership
 // presumably passes to the caller -- confirm against callers.
 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
-    OptimizationConfig *config, const std::string pserverSpec) {
+    OptimizationConfig *config,
+    const std::string pserverSpec) throw(UnsupportError) {
+#ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
      config->m->getConfig(), pserverSpec));
  return updater;
+#else
+  throw UnsupportError();  // NewRemoteParameterUpdater.h is not included in this configuration
+#endif
 }
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(

--- a/paddle/framework/.clang-format
+++ b/paddle/framework/.clang-format
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
+cc_library(ddim SRCS ddim.cc)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
--- a/paddle/majel/ddim.cc
+++ b/paddle/majel/ddim.cc
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"
-namespace majel {
+namespace paddle {
+namespace framework {
 ///@cond HIDDEN
@@ -66,7 +67,7 @@ DDim make_ddim(const std::vector<int>& dims) {
 ///@cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
-public:
+ public:
  DynamicMutableIndexer(int idx) : idx_(idx) {}
  template <int D>
@@ -74,12 +75,12 @@ public:
    return dim[idx_];
  }
-private:
+ private:
  int idx_;
 };
 class DynamicConstIndexer : public boost::static_visitor<int> {
-public:
+ public:
  DynamicConstIndexer(int idx) : idx_(idx) {}
  template <int D>
@@ -87,7 +88,7 @@ public:
    return dim[idx_];
  }
-private:
+ private:
  int idx_;
 };
@@ -213,10 +214,11 @@ struct DDimPrinter : boost::static_visitor<void> {
 ///\endcond
-std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
+std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);  // visitor constructed around the output stream
  boost::apply_visitor(printer, ddim);  // dispatch on the Dim<N> currently held by ddim
  return os;  // return the stream so insertions can be chained
 }
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/majel/ddim.h
+++ b/paddle/majel/ddim.h
@@ -5,20 +5,14 @@
 #include <stdexcept>
 #include <vector>
-#include "paddle/majel/dim.h"
+#include "paddle/framework/dim.h"
-namespace majel {
+namespace paddle {
+namespace framework {
 namespace {
-typedef boost::variant<Dim<1>,
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
-                       Dim<2>,
+                       Dim<8>, Dim<9>>
-                       Dim<3>,
-                       Dim<4>,
-                       Dim<5>,
-                       Dim<6>,
-                       Dim<7>,
-                       Dim<8>,
-                       Dim<9>>
    DDimVar;
 }
@@ -95,14 +89,15 @@ ssize_t product(const DDim& ddim);
 int arity(const DDim& ddim);
-std::ostream& operator<<(std::ostream&, const majel::DDim&);
+std::ostream& operator<<(std::ostream&, const DDim&);
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
 namespace boost {
 // boost::get overload for DDim: forwards to boost::get<T> on the DDim's
 // `var` member (presumably the DDimVar variant -- confirm in the DDim definition).
 template <typename T>
-T get(const majel::DDim& in) {
+T get(const paddle::framework::DDim& in) {
  return boost::get<T>(in.var);
 }

--- a/paddle/majel/ddim_test.cc
+++ b/paddle/majel/ddim_test.cc
@@ -4,18 +4,18 @@
 #include <vector>
 #include "gtest/gtest.h"
-#include "paddle/majel/ddim.h"
+#include "paddle/framework/ddim.h"
 TEST(DDim, Equality) {
  // construct a DDim from an initialization list
-  majel::DDim ddim = majel::make_ddim({9, 1, 5});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
  EXPECT_EQ(ddim[0], 9);
  EXPECT_EQ(ddim[1], 1);
  EXPECT_EQ(ddim[2], 5);
  // construct a DDim from a vector
  std::vector<int> vec({9, 1, 5});
-  majel::DDim vddim = majel::make_ddim(vec);
+  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
  EXPECT_EQ(ddim[0], 9);
  EXPECT_EQ(ddim[1], 1);
  EXPECT_EQ(ddim[2], 5);
@@ -23,43 +23,43 @@ TEST(DDim, Equality) {
  // mutate a DDim
  ddim[1] = 2;
  EXPECT_EQ(ddim[1], 2);
-  majel::set(ddim, 0, 6);
+  paddle::framework::set(ddim, 0, 6);
-  EXPECT_EQ(majel::get(ddim, 0), 6);
+  EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
  // vectorize a DDim
-  std::vector<int> res_vec = majel::vectorize(vddim);
+  std::vector<int> res_vec = paddle::framework::vectorize(vddim);
  EXPECT_EQ(res_vec[0], 9);
  EXPECT_EQ(res_vec[1], 1);
  EXPECT_EQ(res_vec[2], 5);
-  majel::Dim<3> d(3, 2, 1);
+  paddle::framework::Dim<3> d(3, 2, 1);
-  res_vec = majel::vectorize(majel::DDim(d));
+  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
  EXPECT_EQ(res_vec[0], 3);
  EXPECT_EQ(res_vec[1], 2);
  EXPECT_EQ(res_vec[2], 1);
  // add two DDims
-  majel::DDim ddim_sum = ddim + vddim;
+  paddle::framework::DDim ddim_sum = ddim + vddim;
  EXPECT_EQ(ddim_sum[0], 15);
  EXPECT_EQ(ddim_sum[1], 3);
  EXPECT_EQ(ddim_sum[2], 10);
  // multiply two DDims
-  majel::DDim ddim_mul = ddim * vddim;
+  paddle::framework::DDim ddim_mul = ddim * vddim;
  EXPECT_EQ(ddim_mul[0], 54);
  EXPECT_EQ(ddim_mul[1], 2);
  EXPECT_EQ(ddim_mul[2], 25);
  // arity of a DDim
-  EXPECT_EQ(majel::arity(ddim), 3);
+  EXPECT_EQ(paddle::framework::arity(ddim), 3);
  // product of a DDim
-  EXPECT_EQ(majel::product(vddim), 45);
+  EXPECT_EQ(paddle::framework::product(vddim), 45);
 }
 // Checks that streaming a DDim yields its extents separated by ", ".
 TEST(DDim, Print) {
  // print a DDim
  std::stringstream ss;
-  majel::DDim ddim = majel::make_ddim({2, 3, 4});
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
  ss << ddim;
  EXPECT_EQ("2, 3, 4", ss.str());
 }
--- a/paddle/majel/dim.h
+++ b/paddle/majel/dim.h
@@ -5,10 +5,11 @@
 #include <stdexcept>
 #include <type_traits>
-#include "paddle/majel/detail/cuda_assert.h"
+#include "paddle/platform/assert.h"
-#include "paddle/majel/detail/hostdevice.h"
+#include "paddle/platform/hostdevice.h"
-namespace majel {
+namespace paddle {
+namespace framework {
 // Statically sized, statically indexed dimension
 template <int i>
@@ -74,7 +75,7 @@ struct Dim<1> {
      throw std::invalid_argument("Index out of range.");
    }
 #else
-    MAJEL_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx < size.head);
 #endif
  }
@@ -131,7 +132,7 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
    throw std::invalid_argument("Tried to access a negative dimension");
  }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
  if (idx == 0) {
    return dim.head;
@@ -146,7 +147,7 @@ HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
    throw std::invalid_argument("Invalid index");
  }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
  return dim.head;
 }
@@ -158,7 +159,7 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
    throw std::invalid_argument("Tried to access a negative dimension");
  }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
  if (idx == 0) {
    return dim.head;
@@ -173,7 +174,7 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
    throw std::invalid_argument("Invalid index");
  }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
  return dim.head;
 }
@@ -411,7 +412,7 @@ HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
 // XXX For some reason, overloading fails to resolve this correctly
 // Stream insertion for Dim<i> with i > 1: writes the head, then ", ", then
 // streams the tail.
 template <int i>
 typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
  os << d.head << ", " << d.tail;
  return os;
 }
@@ -420,7 +421,7 @@ typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
 // XXX I wish this could be an overload instead of a template
 // Stream insertion for the i == 1 case: writes only the head, with no
 // trailing separator.
 template <int i>
 typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
  os << d.head;
  return os;
 }
@@ -448,4 +449,5 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
  return result;
 }
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
+#include <thrust/device_vector.h>
+#include <sstream>
+#include "paddle/framework/dim.h"
+#include "gtest/gtest.h"
+__global__ void test(paddle::framework::Dim<2>* o) {  // kernel: builds a Dim<2> in device code
+    o[0] = paddle::framework::make_dim(5, 6);  // stores (5, 6) in the first output slot
+}
+__global__ void dyn_idx_gpu(int* o) {  // kernel: exercises operator[] on a Dim in device code
+    auto d = paddle::framework::make_dim(5, 6);
+    o[0] = d[1];  // second extent; the host-side test expects 6
+}
+TEST(Dim, Equality) {  // construction, element access, mutation and index helpers for Dim
+    // construct a Dim on the CPU
+    auto a = paddle::framework::make_dim(3, 4);
+    EXPECT_EQ(paddle::framework::get<0>(a), 3);
+    EXPECT_EQ(paddle::framework::get<1>(a), 4);
+    // construct a Dim on the GPU
+    thrust::device_vector<paddle::framework::Dim<2>> t(2);
+    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+    a = t[0];  // read back the element written by the kernel
+    EXPECT_EQ(paddle::framework::get<0>(a), 5);
+    EXPECT_EQ(paddle::framework::get<1>(a), 6);
+    // linearization
+    auto b = paddle::framework::make_dim(7, 8);
+    EXPECT_EQ(paddle::framework::linearize(a, b), 83);  // 5*7 + 6*8
+    // product
+    EXPECT_EQ(paddle::framework::product(a), 30);  // 5*6
+    // mutate a Dim
+    paddle::framework::get<1>(b) = 10;
+    EXPECT_EQ(paddle::framework::get<0>(b), 7);
+    EXPECT_EQ(paddle::framework::get<1>(b), 10);
+    // dynamic access
+    paddle::framework::get(b, 0) = 8;
+    b[1] = 11;
+    EXPECT_EQ(paddle::framework::get<0>(b), 8);
+    EXPECT_EQ(paddle::framework::get<1>(b), 11);
+    EXPECT_EQ(paddle::framework::get(b, 0), 8);
+    EXPECT_EQ(b[1], 11);
+    // dynamic access on GPU
+    thrust::device_vector<int> r(1);
+    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+    int res = r[0];
+    EXPECT_EQ(res, 6);
+    // ex_prefix_mul
+    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 12);  // 3*4
+    // contiguous_strides
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 0);  // the extent-1 axis is expected to get stride 0
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 10);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
+    EXPECT_EQ(paddle::framework::get<0>(c), 0);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 10);
+    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 2);
+    EXPECT_EQ(paddle::framework::get<2>(c), 6);
+    // generate from an index
+    auto size = paddle::framework::make_dim(4, 5, 2);
+    c = paddle::framework::Dim<3>(14, size);  // 14 == 2 + 3*4 + 0*(4*5)
+    EXPECT_EQ(paddle::framework::get<0>(c), 2);
+    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+    c = paddle::framework::Dim<3>(25, size);  // 25 == 1 + 1*4 + 1*(4*5)
+    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+}
+TEST(Dim, Bool) {  // boolean-valued helpers: contained(), operator== and contiguous()
+    auto a = paddle::framework::make_dim(3, 4);
+    auto b = paddle::framework::make_dim(5, 6);
+    auto c = paddle::framework::make_dim(3, 4);
+    // in_bounds check
+    EXPECT_TRUE(paddle::framework::contained(a, b));
+    EXPECT_FALSE(paddle::framework::contained(b, a));
+    // comparison
+    EXPECT_TRUE(a == a);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a == c);
+    // contiguous check
+    int x = 4, y = 5, z = 2;
+    paddle::framework::Dim<3> sizef(x, y, z);
+    paddle::framework::Dim<3> stridea(1, x, x*y);  // the only stride set expected to be contiguous
+    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);  // every stride doubled
+    paddle::framework::Dim<3> stridec(1, x, 2*x*y);  // only the last stride doubled
+    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
+    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
+}
+TEST(Dim, Print) {  // stream output of Dim: extents separated by ", "
+    {
+        std::stringstream ss;
+        auto a = paddle::framework::make_dim(2, 3);
+        ss << a;
+        EXPECT_EQ(ss.str(), "2, 3");
+    }
+    {
+        std::stringstream ss;
+        ss << paddle::framework::make_dim(8);  // one-dimensional case: no separator expected
+        EXPECT_EQ(ss.str(), "8");
+    }
+}
--- a/paddle/majel/README.md
+++ b/paddle/majel/README.md
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -68,12 +68,10 @@ public:
    numOutputs_ = 1;
  }
-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
  // input can be INPUT and INPUT_GRAD
  // filter can be FILTER and FILTER_GRAD
  // output can be OUTPUT and OUTPUT_GRAD
-  void check(const TensorShape& input,
+  void checkShape(const TensorShape& input,
                  const TensorShape& filter,
                  const TensorShape& output) {
    // inputs and outputs arguments should be 4-dimensional.

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -117,15 +117,23 @@ public:
    ConvFunctionBase::init(config);
  }
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {  // validates argument shapes via checkShape()
+    const TensorShape& input = inputs[0].shape();  // first input is used as the input shape
+    const TensorShape& filter = inputs[1].shape();  // second input is used as the filter shape
+    const TensorShape& output = outputs[0].shape();  // first output is used as the output shape
+    checkShape(input, filter, output);
+  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    // TODO(hedaoyuan): Need to define some index macros,
    // to avoid using 0 and 1.
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
-    check(input, filter, output);
    real beta;
    if (outputs[0].getArgType() == ADD_TO) {
@@ -209,16 +217,24 @@ public:
    ConvFunctionBase::init(config);
  }
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {  // validates argument shapes via checkShape()
+    const TensorShape& output = inputs[0].shape();  // here the first input plays the "output" role
+    const TensorShape& filter = inputs[1].shape();  // second input is used as the filter shape
+    const TensorShape& input = outputs[0].shape();  // and the first output plays the "input" role
+    checkShape(input, filter, output);
+  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    // Since the implementation of Col2ImFunctor is ADD_TO,
    // this function only supports ADD_TO mode.
    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
-    check(input, filter, output);
    size_t batchSize = input[0];
    size_t inputChannels = input[1];
@@ -295,13 +311,21 @@ public:
    ConvFunctionBase::init(config);
  }
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {  // validates argument shapes via checkShape()
+    const TensorShape& output = inputs[0].shape();  // first input plays the "output" role
+    const TensorShape& input = inputs[1].shape();  // second input plays the "input" role
+    const TensorShape& filter = outputs[0].shape();  // first output plays the "filter" role
+    checkShape(input, filter, output);
+  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();
-    check(input, filter, output);
    real beta;
    if (outputs[0].getArgType() == ADD_TO) {

--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -54,8 +54,8 @@ public:
                  T inValue;
                  const int inH = inStartH + fH;
                  const int inW = inStartW + fW;
-                  if ((inH >= 0 && inH < inputHeight) &&
+                  if ((inH >= 0 && inH < (int)inputHeight) &&
-                      (inW >= 0 && inW < inputWidth)) {
+                      (inW >= 0 && inW < (int)inputWidth)) {
                    size_t offsetInput =
                        batch * inputChannels * inputHeight * inputWidth +
                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
@@ -90,14 +90,19 @@ public:
    ConvFunctionBase::init(config);
  }
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+  virtual void check(const BufferArgs& inputs,
-    CHECK_EQ(numInputs_, inputs.size());
+                     const BufferArgs& outputs) override {
-    CHECK_EQ(numOutputs_, outputs.size());
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
-    check(input, filter, output);
+    checkShape(input, filter, output);
+  }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
    size_t batchSize = inputs[0].shape()[0];
    size_t inputChannels = inputs[0].shape()[1];

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -284,6 +284,16 @@ public:
  }
 protected:
+  std::vector<Argument::SeqInfo> commonSeqInfo_;
+  ICpuGpuVectorPtr sequenceStartPositions_;
+  void calcSequenceStartPositions();
+  void checkInputConsistency(int inlinkId,
+                             const std::vector<Argument::SeqInfo>& seqInfo);
+  void reorganizeInput(PassType passType);
+  void reorganizeOutput(PassType passType);
+  void connectFrames(PassType passType);
+  void calcNumSequencesAtEachStep();
  void resizeOrCreateFrames(int numFrames);
  void resizeBootFrame(int numSequences);
@@ -295,7 +305,6 @@ protected:
    std::string linkName;
    LayerPtr inLayer;
    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    bool hasSubseq;
    Argument outArg;               // scatter output argument
  };
  std::vector<InFrameLine> inFrameLines_;
@@ -318,7 +327,6 @@ protected:
    std::vector<LayerPtr> agents;
    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
    Argument outArg;                      // scatter output argument
-    bool is_sequence;
    // Different memoryFrameLine have different element as follows
    IVectorPtr allIds;  // scattered id of realLayer
    ICpuGpuVectorPtr
@@ -330,22 +338,27 @@ protected:
  // and all outFrameLines(outlinks) share the info with one inFrameLine,
  // which is assigned by targetInfoInlinkId_.
  struct Info {
-    IVectorPtr allIds;         // scattered id of realLayer
+    // The original positions in the original batch
-    std::vector<int> idIndex;  // index of allIds
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+    // index of allIds for each step [maxSequenceLength_]
+    // idIndex[i] is the total length of the first i sequences
+    std::vector<int> idIndex;
    ICpuGpuVectorPtr
        sequenceStartPositions;         // scattered sequenceStartPositions
    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
  };
-  std::vector<Info> info_;
+  std::vector<Info> info_;  // for input
  // numSeqs_[i] is the number sequences which is longer than i (for sequence
  // data) or has more than i subsequences (for subsequence data)
+  // Equivalently, numSeqs_[i] is the number of sequences at step i;
  std::vector<int> numSeqs_;
  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
-  // the id of inlink which share info with outlinks
+  void checkOutputConsistency(OutFrameLine& outFrameLine);
-  int targetInfoInlinkId_;
  /* create scattered id infomation for all realLayer of inFrameLines one time.
   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
@@ -354,6 +367,28 @@ protected:
  void createInFrameInfo(int inlinks_id,
                         const Argument& input,
                         PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);
  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                             PassType passType);
@@ -386,9 +421,7 @@ protected:
  NeuralNetwork* rootNetwork_;
  bool reversed_;
-  // if hasSubseq: max number of sentences(subseq)in batchsize samples
+  int maxSequenceLength_;  // Max top-level length
-  // else: max number of tokens in batchsize samples(sentences)
-  int maxSequenceLength_;
  bool useGpu_;
  bool stopBeamSearch_;

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -35,36 +35,15 @@ bool AgentLayer::init(const LayerMap& layerMap,
 void AgentLayer::forward(PassType passType) {
  Layer::forward(passType);
-  Argument& realOutput = realLayer_->getOutput();
-  int realHeight = realOutput.getBatchSize();
-  CHECK_LE(numSamples_, realHeight);
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realHeight) {
-    if (realOutput.ids) {
-      output_.ids =
-          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
-    } else {
-      output_.subArgFrom(
-          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
-    }
-  } else {
-    output_ = realOutput;
-  }
-}
-void SequenceAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
  Argument& realOutput = realLayer_->getOutput();
  int realNumSequences = realOutput.getNumSequences();
  CHECK_LE(numSamples_, realNumSequences);
  // get Arguments from real layers
  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
      int numRows =
          realOutput.sequenceStartPositions->getData(false)[numSamples_];
-    CHECK(!realOutput.ids) << "Not supported";
      output_.subArgFrom(realOutput,
                         /* offset */ 0,
                         numRows,
@@ -74,13 +53,15 @@ void SequenceAgentLayer::forward(PassType passType) {
                         /* seqFlag */ true,
                         /* seqStart */ 0,
                         /* seqSize */ numSamples_ + 1);
+    } else {
+      output_.subArgFrom(
+          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
+    }
  } else {
    output_ = realOutput;
  }
 }
-REGISTER_LAYER(sequence_agent, SequenceAgentLayer);
 bool GatherAgentLayer::init(const LayerMap& layerMap,
                            const ParameterMap& parameterMap) {
  CHECK_EQ(config_.inputs_size(), 0);
@@ -91,18 +72,26 @@ bool GatherAgentLayer::init(const LayerMap& layerMap,
  return true;
 }
-void GatherAgentLayer::copyIdAndSequenceInfo(const Argument& input,
+void GatherAgentLayer::copyIdAndSequenceInfo(
+    ICpuGpuVectorPtr sequenceStartPositions,
+    ICpuGpuVectorPtr subSequenceStartPositions,
    const IVectorPtr& ids,
    const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = input.sequenceStartPositions;
+  output_.sequenceStartPositions = sequenceStartPositions;
-  output_.subSequenceStartPositions = input.subSequenceStartPositions;
+  output_.subSequenceStartPositions = subSequenceStartPositions;
-  realLayers_.clear();
  allIds_ = ids;
  idIndex_ = idIndex;
 }
 void GatherAgentLayer::forward(PassType passType) {
  Layer::forward(passType);
+  forwardIds(passType);
+  forwardValue(passType);
+}
+void GatherAgentLayer::forwardValue(PassType passType) {
+  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
+  if (!valueReal) return;
  int height = allIds_->getSize();
  int width = this->getSize();
@@ -147,7 +136,9 @@ void ScatterAgentLayer::forward(PassType passType) {
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
  int width = this->getSize();
-  if (realOutArg_.value || realOutArg_.ids) {
+  if (realOutArg_.hasSeq()) {
+    forwardSequence(passType);
+  } else if (realOutArg_.value || realOutArg_.ids) {
    output_.subArgFrom(
        realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
  } else {  // used in generation
@@ -174,7 +165,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
  if (realGrad) {
    // for agent in inFrameLines and memoryFrameLines,
    // only first scatterAgentLayer should do addToRows in backward
-    if (idIndex_ == 0) {
+    if (handleBackward_) {
      outputGrad->addToRows(*realGrad, *ids_);
    }
  }
@@ -183,12 +174,14 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
 REGISTER_LAYER(gather_agent, GatherAgentLayer);
 REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
-void SequenceGatherAgentLayer::forward(PassType passType) {
+void GatherAgentLayer::forwardIds(PassType passType) {
-  Layer::forward(passType);
  int height = 0;
-  int* starts = output_.subSequenceStartPositions->getMutableData(false);
  IVectorPtr idReal = realLayers_[0]->getOutputLabel();
-  if (idReal) {
+  if (!idReal) return;
+  if (output_.subSequenceStartPositions) {
+    int* starts = output_.subSequenceStartPositions->getMutableData(false);
    // Gather generator.idsVec
    // if is beam search generation result. Get first result.
    if (idReal->getData()[idReal->getSize() - 1] == -1) {
@@ -212,13 +205,11 @@ void SequenceGatherAgentLayer::forward(PassType passType) {
          ->copyFrom(*realLayers_[i]->getOutputLabel());
    }
  } else {
-    // Gather output.value, same as GatherAgentLayer
+    LOG(FATAL) << "Not implemented";
-    CHECK(output_.subSequenceStartPositions);
-    GatherAgentLayer::forward(passType);
  }
 }
-void SequenceScatterAgentLayer::forward(PassType passType) {
+void ScatterAgentLayer::forwardSequence(PassType passType) {
  Layer::forward(passType);
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
@@ -241,6 +232,7 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);
  } else {
+    // Putting the generation logic here is really an ugly hack!
    // used in generation
    int height = 0;
    size_t numSequences = ids_->getSize();
@@ -284,7 +276,4 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
  }
 }
-REGISTER_LAYER(sequence_gather_agent, SequenceGatherAgentLayer);
-REGISTER_LAYER(sequence_scatter_agent, SequenceScatterAgentLayer);
 }  // namespace paddle
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -49,18 +49,6 @@ public:
  void backward(const UpdateCallback& callback = nullptr) override {}
 };
-/**
- * like AgentLayer, but use first *numSamples* sequences
- */
-class SequenceAgentLayer : public AgentLayer {
-public:
-  explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
-  ~SequenceAgentLayer() {}
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
 /**
 * Like AgentLayer, but it can gather many real layers. Each real
 * layer give a few rows of a sequence, after gather all real layers,
@@ -83,7 +71,10 @@ public:
            const ParameterMap& parameterMap) override;
  // call before addRealLayer
-  void copyIdAndSequenceInfo(const Argument& input,
+  void clearRealLayers() { realLayers_.clear(); }
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
                             const IVectorPtr& allIds,
                             const std::vector<int>& idIndex);
@@ -92,24 +83,8 @@ public:
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
-};
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
-/**
- * Like GatherAgentLayer, but select a few sequence in real layer.
- * *ids* in addRealLayer() are the ids of selected sequence.
- * It's used to reorder sequence output.
- */
-class SequenceGatherAgentLayer : public GatherAgentLayer {
-public:
-  explicit SequenceGatherAgentLayer(const LayerConfig& config)
-      : GatherAgentLayer(config) {}
-  virtual ~SequenceGatherAgentLayer() {}
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    // same as GatherAgentLayer
-    GatherAgentLayer::backward(callback);
-  }
 };
 /**
@@ -129,6 +104,11 @@ protected:
  int idSize_;
  int seqStartPosIndex_;
  int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+  // used to store expanded cpuStartPositions or subSequenceStartPositions
+  // of real layer.
+  ICpuGpuVectorPtr inputStartPos_;
 public:
  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
@@ -147,13 +127,10 @@ public:
   *                        false(default) in ScatterAgentLayer, and
   *                        true in SequenceScatterAgentLayer.
   */
-  void setRealLayer(LayerPtr layer,
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
-                    const std::vector<int>& ids,
-                    bool copyId = false) {
    realLayer_ = layer;
    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
    ids_->copyFrom(ids.data(), ids.size());
-    if (copyId) {
    if (useGpu_) {
      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
      cpuIds_->copyFrom(ids.data(), ids.size());
@@ -161,7 +138,6 @@ public:
      cpuIds_ = ids_;
    }
  }
-  }
  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
  // are selected row for realOutArg in realLayer
@@ -169,12 +145,14 @@ public:
                             const Argument& outArg,
                             const IVectorPtr& ids,
                             int idIndex,
-                             int idSize) {
+                             int idSize,
+                             bool handleBackward) {
    realLayer_ = layer;
    realOutArg_ = outArg;
    ids_ = ids;
    idIndex_ = idIndex;
    idSize_ = idSize;
+    handleBackward_ = handleBackward;
  }
  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
@@ -187,28 +165,8 @@ public:
  void forward(PassType passType) override;
  void backward(const UpdateCallback& callback) override;
-};
-/**
+  void forwardSequence(PassType passType);
- * Like ScatterAgentLayer, but select a few sequence in real layer.
- * *ids* in setRealLayer() or setRealLayerAndOutput() are the ids of
- * selected sequence. It's used to reorder sequence input.
- */
-class SequenceScatterAgentLayer : public ScatterAgentLayer {
-protected:
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-public:
-  explicit SequenceScatterAgentLayer(const LayerConfig& config)
-      : ScatterAgentLayer(config) {}
-  virtual ~SequenceScatterAgentLayer() {}
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    ScatterAgentLayer::backward(callback);
-  }
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -40,6 +40,7 @@ namespace paddle {
 class FeatureMapExpandLayer : public Layer {
 private:
  int numFilters_;
+  bool asRowVector_;
 public:
  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
@@ -62,6 +63,7 @@ bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
  CHECK_EQ(inputLayers_.size(), 1UL);
  numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
  return true;
 }
@@ -76,6 +78,7 @@ void FeatureMapExpandLayer::forward(PassType passType) {
  {
    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outVTmp =
            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
@@ -87,6 +90,19 @@ void FeatureMapExpandLayer::forward(PassType passType) {
            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
        outVTmp->addRowVector(*inVTmp);
      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
+    }
  }
  /* activation */ {
    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
@@ -102,8 +118,13 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
  MatrixPtr outGrad = getOutputGrad();
  size_t batchSize = getInput(0).getBatchSize();
  int imgSize = inGrad->getWidth();
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
  {
    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
      for (size_t i = 0; i < batchSize; i++) {
        MatrixPtr outGradTmp =
            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
@@ -115,10 +136,19 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
        inGradTmp->collectBias(*outGradTmp, 1);
      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
    }
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
  }
 }

--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -22,10 +22,33 @@ public:
  void forward(PassType passType) override {
    Layer::forward(passType);
+    // Render every input's value to its own string so it can be spliced
+    // into the user-supplied format below.
+    std::vector<std::string> vals;
    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      getInput(i).printValueString(LOG(INFO),
+      std::ostringstream s;
-                                   "layer=" + inputLayers_[i]->getName() + " ");
+      getInput(i).printValueString(s, "");
+      vals.push_back(s.str());
    }
+    // Substitute successive "%s" placeholders of user_arg with the rendered
+    // inputs, in order. `i` counts the placeholders consumed; it is size_t
+    // because it is compared against container sizes.
+    size_t pos = 0;
+    size_t i = 0;
+    std::ostringstream s;
+    const std::string& format = config_.user_arg();
+    while (true) {
+      size_t pos1 = format.find("%s", pos);
+      if (pos1 == std::string::npos) break;
+      if (i >= vals.size()) {
+        break;
+      }
+      s << format.substr(pos, pos1 - pos) << vals[i];
+      pos = pos1 + 2;
+      ++i;
+    }
+    // Fewer placeholders than inputs: report it, but still print what we have.
+    if (i != inputLayers_.size()) {
+      LOG(ERROR) << "Number of value in the format (" << format
+                 << ") is not same as the number of inputs ("
+                 << inputLayers_.size() << ") at " << getName();
+    }
+    // Append whatever follows the last consumed placeholder.
+    s << format.substr(pos);
+    LOG(INFO) << s.str();
  }
  void backward(const UpdateCallback& callback) override {}

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -46,6 +46,9 @@ void SequencePoolLayer::forward(PassType passType) {
  Layer::forward(passType);
  const Argument& input = getInput(0);
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "Input should be a sequence or subsequence for layer " << getName();
  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
  size_t dim = getSize();
  // check

--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -95,3 +95,22 @@ def process_unequalength_seq(settings, file_name):
        words1 = reduce(lambda x, y: x + y, d[0])
        words2 = reduce(lambda x, y: x + y, d[1])
        yield words1, words2, d[2]
+###########################################################
+data3 = [
+    [[[1, 2], [4, 5, 2]], [1, 2], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
+]
+# Used for sequence_rnn_mixed_inputs.py and sequence_rnn_matched_inputs.py
+@provider(
+    input_types=[
+        integer_value_sub_sequence(10), integer_value_sequence(10),
+        integer_value(2)
+    ],
+    should_shuffle=False)
+def process_mixed(settings, file_name):
+    for d in data3:
+        yield d
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_subseq2')
+                        obj='process_subseq')
 settings(batch_size=2, learning_rate=0.01)

--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+encoding = embedding_layer(input=data2, size=word_dim)
+subseq = embedding_layer(input=data1, size=word_dim)
+seq = embedding_layer(input=data2, size=word_dim)
+nonseq = embedding_layer(input=label, size=word_dim)
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_multi_unequalength_inputs.conf
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(subseq, seq, nonseq):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+    decoder = recurrent_group(
+        step=inner_step, name='inner', input=[subseq, seq, nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[
+        subseq, expand_layer(
+            seq, expand_as=subseq,
+            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
+                nonseq,
+                expand_as=subseq,
+                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
+        StaticInput(encoding)
+    ])
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+outputs(classification_cost(input=prob, label=label))
--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+encoding = embedding_layer(input=data2, size=word_dim)
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_multi_unequalength_inputs.conf
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(data1, data2, label):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+        subseq = embedding_layer(input=data1, size=word_dim)
+        seq = embedding_layer(input=data2, size=word_dim)
+        nonseq = embedding_layer(input=label, size=word_dim)
+        print_layer(input=[data1, seq, label, inner_mem])
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+    decoder = recurrent_group(
+        step=inner_step, name='inner',
+        input=[subseq, StaticInput(seq), nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+outputs(classification_cost(input=prob, label=label))
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_seq2')
+                        obj='process_seq')
 settings(batch_size=2, learning_rate=0.01)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1598,6 +1598,8 @@ TEST(Layer, FeatureMapExpandLayer) {
                              /* paraSize= */ 0});
  config.layerConfig.add_inputs();
  for (auto useGpu : {false, true}) {
+    for (auto asRowVec : {false, true}) {
+      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
      testLayerGrad(config,
                    "featmap_expand",
                    /*batch_size*/ 100,
@@ -1605,6 +1607,7 @@ TEST(Layer, FeatureMapExpandLayer) {
                    useGpu,
                    /* useWeight */ true);
    }
+  }
 }
 TEST(Layer, MultiplexLayer) {

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -155,6 +155,15 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
  }
 }
+TEST(RecurrentGradientMachine, rnn_mixed_input) {
+  for (bool useGpu : {false, true}) {
+    test("gserver/tests/sequence_rnn_mixed_inputs.py",
+         "gserver/tests/sequence_rnn_matched_inputs.py",
+         1e-6,
+         useGpu);
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);

--- a/paddle/majel/.gitignore
+++ b/paddle/majel/.gitignore
-build
-third-party
\ No newline at end of file
--- a/paddle/majel/dim_test.cu
+++ b/paddle/majel/dim_test.cu
-#include <thrust/device_vector.h>
-#include <sstream>
-#include "paddle/majel/dim.h"
-#include "gtest/gtest.h"
-__global__ void test(majel::Dim<2>* o) {
-    o[0] = majel::make_dim(5, 6);
-}
-__global__ void dyn_idx_gpu(int* o) {
-    auto d = majel::make_dim(5, 6);
-    o[0] = d[1];
-}
-TEST(Dim, Equality) {
-    // construct a Dim on the CPU
-    auto a = majel::make_dim(3, 4);
-    EXPECT_EQ(majel::get<0>(a), 3);
-    EXPECT_EQ(majel::get<1>(a), 4);
-    // construct a Dim on the GPU
-    thrust::device_vector<majel::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
-    EXPECT_EQ(majel::get<0>(a), 5);
-    EXPECT_EQ(majel::get<1>(a), 6);
-    // linearization
-    auto b = majel::make_dim(7, 8);
-    EXPECT_EQ(majel::linearize(a, b), 83);
-    // product
-    EXPECT_EQ(majel::product(a), 30);
-    // mutate a Dim
-    majel::get<1>(b) = 10;
-    EXPECT_EQ(majel::get<0>(b), 7);
-    EXPECT_EQ(majel::get<1>(b), 10);
-    // dynamic access
-    majel::get(b, 0) = 8;
-    b[1] = 11;
-    EXPECT_EQ(majel::get<0>(b), 8);
-    EXPECT_EQ(majel::get<1>(b), 11);
-    EXPECT_EQ(majel::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
-    // dynamic access on GPU
-    thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
-    EXPECT_EQ(res, 6);
-    // ex_prefix_mul
-    majel::Dim<3> c = majel::ex_prefix_mul(majel::Dim<3>(3, 4, 5));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 12);
-    // contiguous_strides
-    c = majel::contiguous_strides(majel::Dim<3>(10, 1, 10));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 0);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(10, 10, 1));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 10);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::contiguous_strides(majel::Dim<3>(1, 10, 10));
-    EXPECT_EQ(majel::get<0>(c), 0);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(2, 3, 4));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 2);
-    EXPECT_EQ(majel::get<2>(c), 6);
-    // generate from an index
-    auto size = majel::make_dim(4, 5, 2);
-    c = majel::Dim<3>(14, size);
-    EXPECT_EQ(majel::get<0>(c), 2);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::Dim<3>(25, size);
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 1);
-}
-TEST(Dim, Bool) {
-    auto a = majel::make_dim(3, 4);
-    auto b = majel::make_dim(5, 6);
-    auto c = majel::make_dim(3, 4);
-    // in_bounds check
-    EXPECT_TRUE(majel::contained(a, b));
-    EXPECT_FALSE(majel::contained(b, a));
-    // comparison
-    EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    majel::Dim<3> sizef(x, y, z);
-    majel::Dim<3> stridea(1, x, x*y);
-    majel::Dim<3> strideb(2, 2*x, 2*x*y);
-    majel::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(majel::contiguous(sizef, stridea));
-    EXPECT_FALSE(majel::contiguous(sizef, strideb));
-    EXPECT_FALSE(majel::contiguous(sizef, stridec));
-}
-TEST(Dim, Print) {
-    {
-        std::stringstream ss;
-        auto a = majel::make_dim(2, 3);
-        ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
-    }
-    {
-        std::stringstream ss;
-        ss << majel::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
-    }
-}
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -908,12 +908,13 @@ const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
 // Operation will change data and need to reset sync_ & syncFlag_.
 #define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
  do {                                         \
-    setSync(useGpu);                           \
    if (useGpu) {                              \
      copyToGpu();                             \
+      setSync(useGpu);                         \
      return gpuVectorT_->OP(args);            \
    } else {                                   \
      copyToCpu();                             \
+      setSync(useGpu);                         \
      return cpuVectorT_->OP(args);            \
    }                                          \
  } while (0)
@@ -1030,7 +1031,7 @@ void CpuGpuVectorT<T>::copyToCpu() {
    case DATA_AT_GPU:
      CHECK(gpuVectorT_);
      this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_, HPPL_STREAM_DEFAULT);
+      cpuVectorT_->copyFrom(*gpuVectorT_);
      setSync(SYNCED);
      break;
    case DATA_AT_CPU:
@@ -1049,7 +1050,7 @@ void CpuGpuVectorT<T>::copyToGpu() {
    case DATA_AT_CPU:
      CHECK(cpuVectorT_);
      this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_, HPPL_STREAM_DEFAULT);
+      gpuVectorT_->copyFrom(*cpuVectorT_);
      setSync(SYNCED);
      break;
    case DATA_AT_GPU:

--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
+# Build rules for the standalone optimizer library and its unit tests.
+# Make this directory's binary dir visible on the include path.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+# Sources compiled into the static paddle_optimizer library.
+set(OPTIMIZER_SRCS
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
+    optimizer.cc
+    parameter_optimizer.cc
+    sgd_optimizer.cc
+  )
+add_library(paddle_optimizer STATIC ${OPTIMIZER_SRCS})
+# Order the build after gen_proto_cpp; presumably that target generates the
+# OptimizerConfig.pb.h header these sources include -- TODO confirm.
+add_dependencies(paddle_optimizer gen_proto_cpp)
+# Unit tests are only registered when testing is enabled.
+if(WITH_TESTING)
+  add_simple_unittest(serialization_test)
+  add_simple_unittest(parameter_optimizer_test)
+endif()
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
+#include "adadelta_optimizer.h"
+#include <algorithm>
+#include <cmath>
+namespace paddle {
+namespace optimizer {
+// Applies one Adadelta step to the parameter tensor using `gradient`.
+// The running averages are updated in place, element by element.
+void AdadeltaOptimizer::Update(const Tensor* gradient) {
+  // The step counter is what the learning-rate policy is queried with.
+  ++num_sample_passed_;
+  const double lr = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor& weights = *parameter_;
+  const Tensor& g = *gradient;
+  Tensor& avg_sq_grad = *accum_gradient_;
+  Tensor& avg_sq_delta = *accum_delta_;
+  Tensor& delta = *update_delta_;
+  const size_t count = weights.size();
+  for (size_t k = 0; k != count; ++k) {
+    // Decayed average of squared gradients; must be refreshed before the
+    // step is computed because the step divides by it.
+    avg_sq_grad[k] = rho_ * avg_sq_grad[k] + (1.0 - rho_) * g[k] * g[k];
+    // Step scaled by the ratio of the two running RMS values.
+    delta[k] = std::sqrt(avg_sq_delta[k] + epsilon_) /
+               std::sqrt(avg_sq_grad[k] + epsilon_) * g[k];
+    // Decayed average of squared steps, consumed by the next call.
+    avg_sq_delta[k] =
+        rho_ * avg_sq_delta[k] + (1.0 - rho_) * delta[k] * delta[k];
+    // Descent step plus weight decay.
+    weights[k] -= lr * delta[k] + lr * decay_ * weights[k];
+  }
+}
+// Serializes the step counter, the parameter and the three accumulator
+// tensors into an AdadeltaOptimizerState message.
+// @param state_len receives the number of serialized bytes.
+// @return pointer to the serialized bytes. The buffer is thread-local
+//         storage owned by this function and is overwritten by the next
+//         call on the same thread, so callers must copy it. The bytes are
+//         binary and may contain '\0'; always honour *state_len.
+const char* AdadeltaOptimizer::SerializeState(int* state_len) {
+  // Returning c_str() of a function-local string would hand out a pointer
+  // to freed memory, so the bytes are kept in storage that outlives the call.
+  static thread_local std::string buffer;
+  AdadeltaOptimizerState state;
+  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  TensorToProto(*accum_delta_, state.mutable_accum_delta());
+  TensorToProto(*update_delta_, state.mutable_update_delta());
+  buffer = state.SerializeAsString();
+  *state_len = static_cast<int>(buffer.size());
+  return buffer.c_str();
+}
+// Restores the step counter, the parameter and the accumulator tensors
+// from a serialized AdadeltaOptimizerState.
+// @param str serialized message bytes.
+// If the bytes cannot be parsed, an error is logged and the optimizer is
+// left untouched instead of being overwritten from a half-filled message.
+void AdadeltaOptimizer::DeserializeState(const std::string& str) {
+  AdadeltaOptimizerState state;
+  if (!state.ParseFromString(str)) {
+    LOG(ERROR) << "failed to parse AdadeltaOptimizerState, "
+                  "optimizer state is left unchanged";
+    return;
+  }
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+  ProtoToTensor(state.accum_delta(), accum_delta_);
+  ProtoToTensor(state.update_delta(), update_delta_);
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
+#pragma once
+#include "parameter_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// Adadelta optimizer. Besides the state held by ParameterOptimizer it owns
+// three tensors, each allocated with the same element count as `parameter`.
+class AdadeltaOptimizer : public ParameterOptimizer {
+public:
+  // rho, epsilon and decay are stored as given and read by Update().
+  AdadeltaOptimizer(
+      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        accum_delta_(new Tensor(parameter->size())),
+        update_delta_(new Tensor(parameter->size())),
+        rho_(rho),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  // Releases the tensors allocated by the constructor
+  // (deleting a null pointer is a no-op, so no guard is needed).
+  ~AdadeltaOptimizer() {
+    delete accum_gradient_;
+    delete accum_delta_;
+    delete update_delta_;
+  }
+  void Update(const Tensor *gradient) override;
+  const char *SerializeState(int *state_len) override;
+  void DeserializeState(const std::string &state) override;
+private:
+  // Owned working tensors.
+  Tensor *accum_gradient_;
+  Tensor *accum_delta_;
+  Tensor *update_delta_;
+  // Hyper-parameters.
+  double rho_;
+  double epsilon_;
+  double decay_;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
+#include <cmath>
+#include "adagrad_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// Applies one Adagrad step to the parameter tensor using `gradient`.
+// Each element accumulates its squared gradient, and the step is the
+// gradient divided by the square root of that accumulator.
+void AdagradOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor& param = *parameter_;
+  Tensor& accum_g = *accum_gradient_;
+  const Tensor& grad = *gradient;
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] += grad[i] * grad[i];
+    // Descend: the step (and the weight-decay term) must be subtracted.
+    // Adding it would move the parameter up the gradient.
+    param[i] -= learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
+                learning_rate * decay_ * param[i];
+  }
+}
+// Serializes the step counter, the parameter and the squared-gradient
+// accumulator into an AdagradOptimizerState message.
+// @param state_len receives the number of serialized bytes.
+// @return pointer to the serialized bytes. The buffer is thread-local
+//         storage owned by this function and is overwritten by the next
+//         call on the same thread, so callers must copy it. The bytes are
+//         binary and may contain '\0'; always honour *state_len.
+const char* AdagradOptimizer::SerializeState(int* state_len) {
+  // Returning c_str() of a function-local string would hand out a pointer
+  // to freed memory, so the bytes are kept in storage that outlives the call.
+  static thread_local std::string buffer;
+  AdagradOptimizerState state;
+  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  buffer = state.SerializeAsString();
+  *state_len = static_cast<int>(buffer.size());
+  return buffer.c_str();
+}
+// Restores the step counter, the parameter and the squared-gradient
+// accumulator from a serialized AdagradOptimizerState.
+// @param str serialized message bytes.
+// If the bytes cannot be parsed, an error is logged and the optimizer is
+// left untouched instead of being overwritten from a half-filled message.
+void AdagradOptimizer::DeserializeState(const std::string& str) {
+  AdagradOptimizerState state;
+  if (!state.ParseFromString(str)) {
+    LOG(ERROR) << "failed to parse AdagradOptimizerState, "
+                  "optimizer state is left unchanged";
+    return;
+  }
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
+#pragma once
+#include "parameter_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// Adagrad optimizer. Owns one tensor, allocated with the same element
+// count as `parameter`, that accumulates squared gradients.
+class AdagradOptimizer : public ParameterOptimizer {
+public:
+  // epsilon and decay are stored as given and read by Update().
+  AdagradOptimizer(Tensor *parameter,
+                   LrPolicy *lr,
+                   double epsilon,
+                   double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  // Releases the accumulator (deleting a null pointer is a no-op).
+  ~AdagradOptimizer() { delete accum_gradient_; }
+  void Update(const Tensor *gradient) override;
+  const char *SerializeState(int *state_len) override;
+  void DeserializeState(const std::string &state) override;
+private:
+  // Owned accumulator of squared gradients.
+  Tensor *accum_gradient_;
+  // Hyper-parameters.
+  double epsilon_;
+  double decay_;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
+#include "adam_optimizer.h"
+#include <cmath>
+namespace paddle {
+namespace optimizer {
+// Applies one Adam step to the parameter tensor using `gradient`.
+// The learning rate is rescaled once per call by the two bias-correction
+// terms, then both moment estimates are updated element by element.
+void AdamOptimizer::Update(const Tensor *gradient) {
+  ++num_sample_passed_;
+  const double base_rate = lr_policy_->LearningRate(num_sample_passed_);
+  const double correction1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
+  const double correction2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
+  const double step_size = base_rate * (std::sqrt(correction2) / correction1);
+  Tensor &weights = *parameter_;
+  const Tensor &g = *gradient;
+  Tensor &first_moment = *momentums_;
+  Tensor &second_moment = *velocitys_;
+  const size_t count = weights.size();
+  for (size_t k = 0; k != count; ++k) {
+    // Decayed averages of the gradient and of the squared gradient.
+    first_moment[k] = beta_1_ * first_moment[k] + (1.0 - beta_1_) * g[k];
+    second_moment[k] =
+        beta_2_ * second_moment[k] + (1.0 - beta_2_) * g[k] * g[k];
+    // Descent step plus weight decay, both scaled by the corrected rate.
+    weights[k] -=
+        step_size * (first_moment[k] / std::sqrt(second_moment[k] + epsilon_) +
+                     decay_ * weights[k]);
+  }
+}
+// Serializes the step counter, the parameter and both moment tensors into
+// an AdamOptimizerState message.
+// @param state_len receives the number of serialized bytes.
+// @return pointer to the serialized bytes. The buffer is thread-local
+//         storage owned by this function and is overwritten by the next
+//         call on the same thread, so callers must copy it. The bytes are
+//         binary and may contain '\0'; always honour *state_len.
+const char *AdamOptimizer::SerializeState(int *state_len) {
+  // Returning c_str() of a function-local string would hand out a pointer
+  // to freed memory, so the bytes are kept in storage that outlives the call.
+  static thread_local std::string buffer;
+  AdamOptimizerState state;
+  // TODO(zhihong) : add lr_policy serialization
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*momentums_, state.mutable_momentums());
+  TensorToProto(*velocitys_, state.mutable_velocitys());
+  buffer = state.SerializeAsString();
+  *state_len = static_cast<int>(buffer.size());
+  return buffer.c_str();
+}
+// Restores the step counter, the parameter and both moment tensors from a
+// serialized AdamOptimizerState.
+// @param str serialized message bytes.
+// If the bytes cannot be parsed, an error is logged and the optimizer is
+// left untouched instead of being overwritten from a half-filled message.
+void AdamOptimizer::DeserializeState(const std::string &str) {
+  AdamOptimizerState state;
+  if (!state.ParseFromString(str)) {
+    LOG(ERROR) << "failed to parse AdamOptimizerState, "
+                  "optimizer state is left unchanged";
+    return;
+  }
+  // TODO(zhihong) : add lr_policy DeserializeState
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.momentums(), momentums_);
+  ProtoToTensor(state.velocitys(), velocitys_);
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
+#pragma once
+#include "parameter_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// Adam optimizer. Owns two tensors (first and second moment estimates),
+// each allocated with the same element count as `parameter`.
+class AdamOptimizer : public ParameterOptimizer {
+public:
+  // beta_1, beta_2, epsilon and decay are stored as given and read by
+  // Update().
+  AdamOptimizer(Tensor *parameter,
+                LrPolicy *lr,
+                double beta_1,
+                double beta_2,
+                double epsilon,
+                double decay)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(new Tensor(parameter->size())),
+        velocitys_(new Tensor(parameter->size())),
+        beta_1_(beta_1),
+        beta_2_(beta_2),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  // Releases both moment tensors (deleting a null pointer is a no-op).
+  ~AdamOptimizer() {
+    delete momentums_;
+    delete velocitys_;
+  }
+  void Update(const Tensor *gradient) override;
+  const char *SerializeState(int *state_len) override;
+  void DeserializeState(const std::string &state) override;
+private:
+  // Owned moment estimates.
+  Tensor *momentums_;
+  Tensor *velocitys_;
+  // Hyper-parameters.
+  double beta_1_;
+  double beta_2_;
+  double epsilon_;
+  double decay_;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
+#pragma once
+#include <algorithm>
+#include "OptimizerConfig.pb.h"
+namespace paddle {
+namespace optimizer {
+// Abstract learning-rate schedule queried by the optimizers on each update.
+class LrPolicy {
+public:
+  virtual ~LrPolicy() {}
+  // Returns the learning rate to use after `num_sample_passed` updates.
+  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
+  // Returns the serialized policy state; `state_len` is meant to receive
+  // its length.
+  virtual const char *SerializeState(int *state_len) = 0;
+  // Restores the policy from bytes produced by SerializeState().
+  virtual void DeserializeState(const std::string &state) = 0;
+};
+// Learning-rate policy that always yields the rate given at construction.
+class ConstLr final : public LrPolicy {
+public:
+  ConstLr(double lr) : learning_rate(lr) {}
+  // The sample count plays no role for a constant schedule.
+  double LearningRate(const uint64_t num_sample_passed) override {
+    return learning_rate;
+  }
+  // Nothing is serialized: returns nullptr and does not write *state_len.
+  const char *SerializeState(int *state_len) override { return nullptr; }
+  // Nothing to restore.
+  void DeserializeState(const std::string &state) override {}
+private:
+  double learning_rate;
+};
+// Learning-rate policy that decreases linearly with the number of updates
+// (slope lr_decay_a) and never drops below lr_decay_b.
+class LinearLr final : public LrPolicy {
+public:
+  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
+      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+  double LearningRate(const uint64_t num_sample_passed) override {
+    const double decayed = learning_rate - lr_decay_a * num_sample_passed;
+    // Clamp from below at lr_decay_b.
+    return decayed < lr_decay_b ? lr_decay_b : decayed;
+  }
+  // Not implemented yet: returns nullptr and does not write *state_len.
+  const char *SerializeState(int *state_len) override {
+    // TODO(zhihong) : add lr_policy serialization
+    return nullptr;
+  }
+  // Not implemented yet: the argument is ignored.
+  void DeserializeState(const std::string &state) override {
+    // TODO(zhihong) : add lr_policy serialization
+  }
+private:
+  double learning_rate;
+  double lr_decay_a;
+  double lr_decay_b;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
+#include "optimizer.h"
+#include <string>
+#include "parameter_optimizer.h"
+using namespace paddle;
+using namespace paddle::optimizer;
+// Compile-time mapping between paddle_element_type enumerators and C++
+// element types. The primary templates are empty; MATCH_ENUM_TYPE adds one
+// specialization of each per (type, enumerator) pair.
+template <paddle_element_type VALUE>
+struct EnumToType {};
+template <class T>
+struct TypeToEnum {};
+// For a pair (TYPE, ENUM) this defines:
+//   TypeToEnum<TYPE>::v()     - returns ENUM
+//   TypeToEnum<TYPE>::value   - a constexpr TYPE initialized from ENUM
+//   EnumToType<ENUM>::Type    - a typedef for TYPE
+// The macro ends without a semicolon, so each use supplies its own.
+#define MATCH_ENUM_TYPE(TYPE, ENUM)                  \
+  template <>                                        \
+  struct TypeToEnum<TYPE> {                          \
+    static paddle_element_type v() { return ENUM; }; \
+    static constexpr TYPE value = ENUM;              \
+  };                                                 \
+  template <>                                        \
+  struct EnumToType<ENUM> {                          \
+    typedef TYPE Type;                               \
+  }
+MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
+MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
+MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
+MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
+// TODO(zhihong): only implement below type, need to fix
+MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
+MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
+// Definition of the opaque handle declared in optimizer.h: it wraps a
+// pointer to the C++ optimizer implementation.
+struct paddle_optimizer {
+  paddle::optimizer::ParameterOptimizer* impl;
+};
+// Builds an optimizer handle from a serialized OptimizerConfig and a
+// caller-provided parameter buffer; optionally restores a saved state.
+// `data_type` is currently not consulted: the buffer is always viewed as
+// float. `num_bytes` is passed to the Tensor as its size.
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len) {
+  // Copy the raw config bytes into a string for the factory.
+  const std::string serialized_config(config_proto,
+                                      config_proto + config_proto_len);
+  // View over the caller's buffer; the Tensor object itself is handed to
+  // the optimizer created below.
+  Tensor* weights =
+      new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
+  paddle_optimizer* handle = new paddle_optimizer;
+  handle->impl = ParameterOptimizer::Create(serialized_config, weights);
+  // A null `state` means there is nothing to restore.
+  if (state == nullptr) {
+    return handle;
+  }
+  const std::string saved_state(state, state + state_len);
+  handle->impl->DeserializeState(saved_state);
+  return handle;
+}
+// Destroys an optimizer created by paddle_create_optimizer.
+// Frees both the C++ implementation and the handle itself (the handle was
+// allocated with `new` on creation and would otherwise leak).
+// A null handle is accepted and ignored. Always returns PADDLE_SUCCESS.
+int paddle_release_optimizer(paddle_optimizer* o) {
+  if (o != nullptr) {
+    delete o->impl;
+    delete o;
+  }
+  return PADDLE_SUCCESS;
+}
+// Applies one optimizer step using the gradient in `grad_buffer`.
+// `data_type` is currently not consulted: the buffer is always viewed as
+// float. `num_bytes` is passed to the Tensor as its size.
+// Always returns PADDLE_SUCCESS.
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* grad_buffer,
+                            int num_bytes) {
+  // TOOD(zhihong): datatype not work. need to add the runtime datatype
+  auto grad_type = reinterpret_cast<const float*>(grad_buffer);
+  // Non-owning view over the caller's buffer. It lives on the stack so
+  // nothing is leaked per call; Update() only reads through it.
+  Tensor gradient(const_cast<float*>(grad_type), num_bytes);
+  o->impl->Update(&gradient);
+  return PADDLE_SUCCESS;
+}
+// Exposes the optimizer's parameter buffer through *param_buffer and
+// returns the size reported by the optimizer for it.
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
+  int reported_size = 0;
+  float* weights = o->impl->get_weight(&reported_size);
+  *param_buffer = static_cast<void*>(weights);
+  return reported_size;
+}
+// Stores a pointer to the serialized optimizer state in *state and returns
+// the length reported by the optimizer.
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
+  int length = 0;
+  const char* bytes = o->impl->SerializeState(&length);
+  *state = bytes;
+  return length;
+}
--- a/paddle/optimizer/optimizer.h
+++ b/paddle/optimizer/optimizer.h
+#pragma once
+#include <stdbool.h>
+#include <stdint.h>
+/**
+ * @brief The optimizer library is independent of the other modules.
+ * It is used in two cases:
+ * Case A, the gradient is optimized locally on the trainer.
+ *
+ * Case B, the gradient is optimized on the parameter server.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief element type tags for parameter and gradient buffers
+ */
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32 = 0,
+  PADDLE_ELEMENT_TYPE_UINT32 = 1,
+  PADDLE_ELEMENT_TYPE_INT64 = 2,
+  PADDLE_ELEMENT_TYPE_UINT64 = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+/**
+ * @brief execution status codes
+ * NOTE(review): these are definitions, not declarations. In C a file-scope
+ * const object has external linkage, so including this header from more
+ * than one C translation unit would define the symbols twice - confirm
+ * the intended usage.
+ */
+const int32_t PADDLE_SUCCESS = 0;
+const int32_t PADDLE_ERROR = -1;
+/** Opaque optimizer handle; its definition is not part of this header. */
+typedef struct paddle_optimizer paddle_optimizer;
+/**
+ * The functions of this interface are called in this order:
+ * 1. create the optimizer with a config and the parameter buffer
+ * 2. update_parameter (repeatedly)
+ * 3. get_weights / get_state
+ * 4. release the optimizer
+ */
+/**
+ *  @brief create an optimizer from a serialized config
+ *  @param config_proto, serialized optimizer protobuf, see
+ *         OptimizerConfig.proto for details
+ *  @param config_proto_len, length of config_proto
+ *  @param data_type, element type of the parameter buffer
+ *  @param param_buffer, initialized parameter buffer
+ *  @param num_bytes, size of param_buffer
+ *  @param state, previously saved optimizer state, may be NULL
+ *  @param state_len, length of state
+ *  @return the optimizer instance
+ */
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len);
+/**
+ *  @brief release an optimizer
+ *  @param o, the optimizer instance
+ *  @return execution status
+ */
+int paddle_release_optimizer(paddle_optimizer* o);
+/**
+ *  @brief update the parameter with one gradient
+ *  @param o, the optimizer instance
+ *  @param data_type, element type of the gradient and the parameter
+ *  @param gradient, calculated by the optimizer caller.
+ *       TODO(zhihong): just pass loss to reduce communicate overhead.
+ *                     Project Adam Ms'14 paper for detail
+ *  @param num_bytes, gradient size
+ *  @return execution status
+ */
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* gradient,
+                            int num_bytes);
+/**
+ *  @brief get the parameter buffer of the optimizer
+ *  @param o, the optimizer instance
+ *  @param param_buffer, receives a pointer to the parameter buffer
+ *  @return content length
+ */
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer);
+/**
+ *  @brief get the serialized training state of the optimizer
+ *  @param o, the optimizer instance
+ *  @param state, receives a pointer to the serialized state
+ *  @return length of the state buffer
+ */
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state);
+#ifdef __cplusplus
+}
+#endif
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
+#include <glog/logging.h>
+#include "adadelta_optimizer.h"
+#include "adagrad_optimizer.h"
+#include "adam_optimizer.h"
+#include "lr_policy.h"
+#include "sgd_optimizer.h"
+#include "parameter_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// Factory: parses a serialized OptimizerConfig and builds the optimizer it
+// selects for `parameter`. The process aborts (CHECK) if the config cannot
+// be parsed. Unknown selections fall back to ConstLr(0.1) and a plain SGD
+// optimizer, each with a warning.
+ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
+                                               Tensor *parameter) {
+  paddle::OptimizerConfig config;
+  CHECK(config.ParseFromString(config_proto) == true)
+      << "failed parse optimizer config";
+  // Step 1: learning-rate policy.
+  LrPolicy *lr = nullptr;
+  if (config.lr_policy() == OptimizerConfig::Const) {
+    lr = new ConstLr(config.const_lr().learning_rate());
+  } else if (config.lr_policy() == OptimizerConfig::Linear) {
+    lr = new LinearLr(config.linear_lr().learning_rate(),
+                      config.linear_lr().lr_decay_a(),
+                      config.linear_lr().lr_decay_b());
+  } else {
+    // default
+    LOG(WARNING) << " have not select any LrPolicy. use ConstLr in default";
+    lr = new ConstLr(0.1);
+  }
+  // Step 2: optimizer algorithm; every branch receives the policy above.
+  if (config.optimizer() == OptimizerConfig::SGD) {
+    const auto &sgd = config.sgd();
+    return new SGDOptimizer(
+        parameter, lr, sgd.momentum(), sgd.decay(), sgd.nesterov());
+  }
+  if (config.optimizer() == OptimizerConfig::Adadelta) {
+    const auto &adadelta = config.adadelta();
+    return new AdadeltaOptimizer(
+        parameter, lr, adadelta.rho(), adadelta.epsilon(), adadelta.decay());
+  }
+  if (config.optimizer() == OptimizerConfig::Adagrad) {
+    const auto &adagrad = config.adagrad();
+    return new AdagradOptimizer(
+        parameter, lr, adagrad.epsilon(), adagrad.decay());
+  }
+  if (config.optimizer() == OptimizerConfig::Adam) {
+    const auto &adam = config.adam();
+    return new AdamOptimizer(parameter,
+                             lr,
+                             adam.beta_1(),
+                             adam.beta_2(),
+                             adam.epsilon(),
+                             adam.decay());
+  }
+  // default
+  LOG(WARNING)
+      << "have not select any Optimizer. use SGDOptimizer in default";
+  return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
+}
+// Reports the parameter tensor's size through *param_size and returns a
+// pointer to its underlying buffer.
+float *ParameterOptimizer::get_weight(int *param_size) const {
+  const size_t element_count = parameter_->size();
+  *param_size = (int)element_count;
+  float *buffer = parameter_->get_buffer();
+  return buffer;
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
+#pragma once
+#include <glog/logging.h>
+#include <functional>
+#include <string>
+#include "OptimizerConfig.pb.h"
+#include "lr_policy.h"
+#include "serialization.h"
+#include "tensor.h"
+namespace paddle {
+namespace optimizer {
+// Base class of all optimizers. It holds the parameter tensor, the
+// learning-rate policy and an update counter, and deletes the tensor and
+// the policy in its destructor.
+class ParameterOptimizer {
+public:
+  /**
+   * @brief  takes ownership of `parameter` and `lr` (both are deleted by
+   * the destructor) and starts the update counter at zero.
+   */
+  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
+      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
+  virtual ~ParameterOptimizer() {
+    delete parameter_;
+    delete lr_policy_;
+  }
+  // Factory: builds an optimizer from a serialized OptimizerConfig.
+  static ParameterOptimizer *Create(const std::string &config_proto,
+                                    Tensor *parameter);
+  // Applies one optimization step with the given gradient.
+  virtual void Update(const Tensor *gradient) = 0;
+  // Returns the parameter buffer; *param_size receives its size.
+  virtual float *get_weight(int *param_size) const;
+  // Returns the serialized optimizer state; *state_len receives its length.
+  virtual const char *SerializeState(int *state_len) = 0;
+  // Restores the optimizer from bytes produced by SerializeState().
+  virtual void DeserializeState(const std::string &state) = 0;
+protected:
+  // Parameter tensor updated in place by Update().
+  Tensor *parameter_;
+  // learning rate policy
+  LrPolicy *lr_policy_;
+  // Update counter, initialized to zero.
+  uint64_t num_sample_passed_;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
+#include "parameter_optimizer.h"
+#include <cmath>
+#include <map>
+#include <vector>
+#include "gtest/gtest.h"
+#include "lr_policy.h"
+using namespace paddle;
+using namespace paddle::optimizer;
+// Allocates a tensor of `size` elements filled with rand() / RAND_MAX.
+// The caller owns the returned tensor.
+Tensor* FillTensor(size_t size) {
+  Tensor* filled = new Tensor(size);
+  const size_t count = filled->size();
+  for (size_t idx = 0; idx != count; ++idx) {
+    (*filled)[idx] = (float)rand() / (float)RAND_MAX;
+  }
+  return filled;
+}
+// Allocates a tensor of `size` elements where element k holds the value k.
+// The caller owns the returned tensor.
+Tensor* FixedTensor(size_t size) {
+  Tensor* ramp = new Tensor(size);
+  const size_t count = ramp->size();
+  for (size_t idx = 0; idx != count; ++idx) {
+    (*ramp)[idx] = idx;
+  }
+  return ramp;
+}
+// Fixture that builds one SGD and one Adam optimizer over a fixed
+// parameter tensor and runs the same checks against each of them.
+class OptimizerTest : public testing::Test {
+public:
+  // init tensor shape
+  const size_t kSize = 5;
+  virtual void SetUp() {
+    CreateSGD();
+    CreateAdam();
+  }
+  // Each optimizer deletes its parameter tensor and learning-rate policy
+  // in its destructor, so deleting the optimizers releases everything that
+  // SetUp allocated.
+  virtual void TearDown() {
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      delete opts_[i];
+    }
+    opts_.clear();
+  }
+  // Plain SGD: no momentum, no decay, constant learning rate 0.1.
+  void CreateSGD() {
+    Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(OptimizerConfig::SGD);
+    config_.mutable_sgd()->set_momentum(0.0);
+    config_.mutable_sgd()->set_decay(0.0);
+    config_.mutable_sgd()->set_nesterov(false);
+    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+  // Adam with constant learning rate 0.1.
+  void CreateAdam() {
+    Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(OptimizerConfig::Adam);
+    config_.mutable_adam()->set_beta_1(0.9);
+    config_.mutable_adam()->set_beta_2(0.1);
+    config_.mutable_adam()->set_epsilon(1e-3);
+    config_.mutable_adam()->set_decay(0.0);
+    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+  // Before any update the weights must equal the tensor they were built
+  // from, and the reported size must match.
+  void TestGetWeight() {
+    // unique_ptr so the reference tensor is released when the test ends.
+    std::unique_ptr<Tensor> p(FixedTensor(kSize));
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+  }
+  // Smoke test: one update per optimizer must run without aborting.
+  void TestUpdate() {
+    std::unique_ptr<Tensor> g(FixedTensor(kSize));
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      opts_[i]->Update(g.get());
+    }
+  }
+  // Round-trips the state of every optimizer through serialization.
+  void TestCheckPoint() {
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int state_len = 0;
+      const char* bytes = opts_[i]->SerializeState(&state_len);
+      // The state is binary protobuf and may contain '\0', so it has to be
+      // copied with its explicit length rather than as a C string.
+      std::string state(bytes, static_cast<size_t>(state_len));
+      opts_[i]->DeserializeState(state);
+    }
+  }
+private:
+  std::vector<ParameterOptimizer*> opts_;
+  OptimizerConfig config_;
+};
+// Register the fixture methods as individual gtest cases.
+TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
+TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
+TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
+// Test entry point: lets gtest consume its command-line flags, then runs
+// every registered test and returns the result as the exit code.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/optimizer/serialization.h
+++ b/paddle/optimizer/serialization.h
+#pragma once
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include "OptimizerConfig.pb.h"
+#include "paddle/utils/Logging.h"
+#include "tensor.h"
+namespace paddle {
+namespace optimizer {
+// Writes every element of `tensor` into `proto` as one decimal text entry
+// of the repeated `content` field and tags the message as FLOAT32.
+// @param tensor source tensor (float elements).
+// @param proto  destination message; entries are appended to `content`.
+static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
+  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
+  std::stringstream os;
+  // The default stream precision (6 significant digits) silently drops
+  // bits. 9 significant decimal digits are enough to reproduce any 32-bit
+  // IEEE-754 float exactly when it is parsed back. The precision is stream
+  // state and survives the str() resets below.
+  os.precision(9);
+  for (size_t i = 0; i < tensor.size(); ++i) {
+    os << tensor[i];
+    proto->add_content(os.str());
+    // Empty the buffer so the next element starts from a clean stream.
+    os.str(std::string());
+  }
+}
+// Parses each text entry of `proto.content` and stores the value in the
+// tensor element with the same index. The tensor is not resized here.
+static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
+  const int entry_count = proto.content_size();
+  for (int k = 0; k < entry_count; ++k) {
+    // A fresh reader per entry replaces resetting one shared stream.
+    std::istringstream reader(proto.content(k));
+    reader >> (*tensor)[k];
+  }
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
+#include "serialization.h"
+#include "gtest/gtest.h"
+using namespace paddle;
+using namespace paddle::optimizer;
+// Round trip: a tensor written to a TensorProto and read back into a
+// zeroed tensor must compare equal element by element.
+TEST(TensorToProto, Case1) {
+  Tensor source(3), restored(3);
+  const size_t count = source.size();
+  for (size_t k = 0; k != count; ++k) {
+    source[k] = k;
+    restored[k] = 0;
+  }
+  TensorProto message;
+  TensorToProto(source, &message);
+  ProtoToTensor(message, &restored);
+  for (size_t k = 0; k != restored.size(); ++k) {
+    EXPECT_EQ(restored[k], source[k]);
+  }
+}
+// Test entry point: lets gtest consume its command-line flags, then runs
+// every registered test and returns the result as the exit code.
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
+#include "sgd_optimizer.h"
+#include "serialization.h"
+namespace paddle {
+namespace optimizer {
+// Applies one SGD step (optionally with momentum and Nesterov look-ahead)
+// to the parameter tensor using `gradient`.
+void SGDOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  float velocity = 0.0;
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  for (size_t i = 0; i < param.size(); ++i) {
+    if (momentum_ == 0.0) {
+      // Plain step: gradient plus weight decay.
+      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
+    } else {
+      // The momentum buffer is only allocated when momentum is non-zero,
+      // so it is dereferenced only in this branch. Binding a reference to
+      // it unconditionally would dereference a null pointer.
+      Tensor &m = *momentums_;
+      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
+             learning_rate * decay_ * param[i];
+      velocity = m[i];
+    }
+    if (nesterov_) {
+      param[i] += momentum_ * velocity - learning_rate * grad[i];
+    } else {
+      param[i] += velocity;
+    }
+  }
+}
+// Serializes the step counter, the parameter and (only when momentum is
+// enabled) the momentum buffer into an SGDOptimizerState message.
+// @param state_len receives the number of serialized bytes.
+// @return pointer to the serialized bytes. The buffer is thread-local
+//         storage owned by this function and is overwritten by the next
+//         call on the same thread, so callers must copy it. The bytes are
+//         binary and may contain '\0'; always honour *state_len.
+const char *SGDOptimizer::SerializeState(int *state_len) {
+  // Returning c_str() of a function-local string would hand out a pointer
+  // to freed memory, so the bytes are kept in storage that outlives the call.
+  static thread_local std::string buffer;
+  SGDOptimizerState state;
+  state.set_num_sample_passed(num_sample_passed_);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
+  buffer = state.SerializeAsString();
+  *state_len = static_cast<int>(buffer.size());
+  return buffer.c_str();
+}
+// Restores the step counter, the parameter and (only when momentum is
+// enabled) the momentum buffer from a serialized SGDOptimizerState.
+// @param str serialized message bytes.
+// If the bytes cannot be parsed, an error is logged and the optimizer is
+// left untouched instead of being overwritten from a half-filled message.
+void SGDOptimizer::DeserializeState(const std::string &str) {
+  SGDOptimizerState state;
+  if (!state.ParseFromString(str)) {
+    LOG(ERROR) << "failed to parse SGDOptimizerState, "
+                  "optimizer state is left unchanged";
+    return;
+  }
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  // The momentum buffer must come from the `momentums` field that
+  // SerializeState wrote, not from a second copy of the parameter.
+  if (momentum_ != 0.0) ProtoToTensor(state.momentums(), momentums_);
+}
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
+#pragma once
+#include "parameter_optimizer.h"
+namespace paddle {
+namespace optimizer {
+// SGD optimizer with optional momentum and Nesterov look-ahead.
+// The momentum buffer exists only when the momentum coefficient is
+// non-zero; otherwise momentums_ stays null.
+class SGDOptimizer : public ParameterOptimizer {
+public:
+  // m: momentum coefficient, d: weight decay, n: enable Nesterov.
+  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
+      : ParameterOptimizer(parameter, lr),
+        // TODO: fix it with align aware allocator bind to Tensor
+        momentums_(m != 0.0 ? new Tensor(parameter->size()) : nullptr),
+        momentum_(m),
+        decay_(d),
+        nesterov_(n) {}
+  // Releases the momentum buffer (deleting a null pointer is a no-op).
+  virtual ~SGDOptimizer() { delete momentums_; }
+  void Update(const Tensor* gradient) override;
+  const char* SerializeState(int* state_len) override;
+  void DeserializeState(const std::string& state) override;
+private:
+  // Owned momentum buffer, or null when momentum is disabled.
+  Tensor* momentums_;
+  // Hyper-parameters.
+  double momentum_;
+  double decay_;
+  bool nesterov_;
+};
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/optimizer/tensor.h
+++ b/paddle/optimizer/tensor.h
+#pragma once
+/**
+ * @brief tensor used by optimizer
+ */
+#include <string.h>
+#include <memory>
+#include "paddle/utils/Common.h"
+#include "paddle/utils/Logging.h"
+namespace paddle {
+namespace optimizer {
+// Minimal flat tensor used by the optimizer library.
+// It either owns its buffer (size constructor) or is a non-owning view
+// over caller memory (pointer constructors). Elements are addressed with
+// a flat index in [0, size()).
+template <class T>
+class TensorT {
+public:
+  // Allocates an owned buffer of `size` elements. The elements are
+  // value-initialized (zero for arithmetic types): the optimizers read
+  // freshly created tensors as accumulators before writing them, so an
+  // uninitialized buffer would feed indeterminate values into the update.
+  TensorT(size_t size) : height_(1), width_(size) {
+    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
+    data_ = data_ptr_.get();
+  }
+  // Non-owning view over `size` elements starting at `data`.
+  TensorT(T* data, size_t size)
+      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
+  // Non-owning view over an h x w block starting at `data`.
+  TensorT(T* data, size_t h, size_t w)
+      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
+  virtual ~TensorT() {}
+  // Raw pointer to the first element.
+  T* get_buffer() { return this->data_; }
+  // Flat element access. Aborts (CHECK) on an out-of-range index. The
+  // bound is size(), so it agrees with loops written as `i < size()` for
+  // tensors with more than one row; idx is unsigned, so no lower-bound
+  // test is needed.
+  T& operator[](const size_t idx) {
+    CHECK(idx < this->size()) << "out of index range";
+    return data_[idx];
+  }
+  T& operator[](const size_t idx) const {
+    CHECK(idx < this->size()) << "out of index range";
+    return data_[idx];
+  }
+  // TODO: replace with tensorshape
+  // Total number of elements (width * height).
+  size_t size() const { return this->width_ * this->height_; }
+protected:
+  size_t height_;
+  size_t width_;
+  // Keeps an owned buffer alive; null for non-owning views.
+  std::shared_ptr<T> data_ptr_;
+  // Element storage, owned or borrowed.
+  T* data_;
+};
+// TODO(zhihong): design problem of dynamic datatype, need to fix it
+typedef TensorT<float> Tensor;
+}  // namespace optimizer
+}  // namespace paddle
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -149,6 +149,7 @@ struct Argument {
                                     : getBatchSize();
  }
+  bool hasSeq() const { return sequenceStartPositions != nullptr; }
  bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
  const int* getCpuStartPositions() const {

--- a/paddle/platform/.clang-format
+++ b/paddle/platform/.clang-format
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
--- a/paddle/majel/CMakeLists.txt
+++ b/paddle/majel/CMakeLists.txt
+nv_test(cuda_test SRCS cuda_test.cu)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-cc_library(ddim SRCS ddim.cc)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-nv_test(cuda_test SRCS cuda_test.cu)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
--- a/paddle/majel/detail/cuda_assert.h
+++ b/paddle/majel/detail/cuda_assert.h
@@ -5,28 +5,25 @@
 #if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
 #include <stdio.h>
-#define MAJEL_ASSERT(e)                                                       \
+#define PADDLE_ASSERT(e)                                           \
  do {                                                             \
    if (!(e)) {                                                    \
-      printf(                                                                 \
+      printf("%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \
-          "%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, TOSTRING(e)); \
+             TOSTRING(e));                                         \
      asm("trap;");                                                \
    }                                                              \
  } while (0)
-#define MAJEL_ASSERT_MSG(e, m)                      \
+#define PADDLE_ASSERT_MSG(e, m)                                         \
  do {                                                                  \
    if (!(e)) {                                                         \
-      printf("%s:%d Assertion `%s` failed (%s).\n", \
+      printf("%s:%d Assertion `%s` failed (%s).\n", __FILE__, __LINE__, \
-             __FILE__,                              \
+             TOSTRING(e), m);                                           \
-             __LINE__,                              \
-             TOSTRING(e),                           \
-             m);                                    \
      asm("trap;");                                                     \
    }                                                                   \
  } while (0)
 #else
 #include <assert.h>
-#define MAJEL_ASSERT(e) assert(e)
+#define PADDLE_ASSERT(e) assert(e)
-#define MAJEL_ASSERT_MSG(e, m) assert((e) && (m))
+#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
--- a/paddle/majel/cuda_test.cu
+++ b/paddle/majel/cuda_test.cu
--- a/paddle/majel/detail/hostdevice.h
+++ b/paddle/majel/detail/hostdevice.h
--- a/paddle/majel/place.cc
+++ b/paddle/majel/place.cc
-#include "paddle/majel/place.h"
+#include "paddle/platform/place.h"
-namespace majel {
+namespace paddle {
+namespace platform {
 namespace detail {
 class PlacePrinter : public boost::static_visitor<> {
-private:
+ public:
-  std::ostream& os_;
+  PlacePrinter(std::ostream &os) : os_(os) {}
+  void operator()(const CpuPlace &) { os_ << "CpuPlace"; }
+  void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; }
-public:
+ private:
-  PlacePrinter(std::ostream& os) : os_(os) {}
+  std::ostream &os_;
-  void operator()(const CpuPlace&) { os_ << "CpuPlace"; }
-  void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; }
 };
 }  // namespace detail
 static Place the_default_place;
-void set_place(const Place& place) { the_default_place = place; }
+void set_place(const Place &place) { the_default_place = place; }
+const Place &get_place() { return the_default_place; }
-const Place& get_place() { return the_default_place; }
 const GpuPlace default_gpu() { return GpuPlace(0); }
 const CpuPlace default_cpu() { return CpuPlace(); }
-bool is_gpu_place(const Place& p) {
+bool is_gpu_place(const Place &p) {
  return boost::apply_visitor(IsGpuPlace(), p);
 }
+bool is_cpu_place(const Place &p) {
-bool is_cpu_place(const Place& p) {
  return !boost::apply_visitor(IsGpuPlace(), p);
 }
-bool places_are_same_class(const Place& p1, const Place& p2) {
+bool places_are_same_class(const Place &p1, const Place &p2) {
  return is_gpu_place(p1) == is_gpu_place(p2);
 }
-std::ostream& operator<<(std::ostream& os, const majel::Place& p) {
+std::ostream &operator<<(std::ostream &os, const Place &p) {
-  majel::detail::PlacePrinter printer(os);
+  detail::PlacePrinter printer(os);
  boost::apply_visitor(printer, p);
  return os;
 }
-}  // namespace majel
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/majel/place.h
+++ b/paddle/majel/place.h
@@ -2,49 +2,48 @@
 #include <boost/variant.hpp>
 #include <iostream>
-namespace majel {
+namespace paddle {
+namespace platform {
 struct CpuPlace {
-  CpuPlace() {}  // WORKAROUND: for some reason, omitting this constructor
+  // WORKAROUND: for some reason, omitting this constructor
  // causes errors with boost 1.59 and OSX
-  // needed for variant equality comparison
+  CpuPlace() {}
-  inline bool operator==(const CpuPlace&) const { return true; }
-  inline bool operator!=(const CpuPlace&) const { return false; }
+  // needed for variant equality comparison
+  inline bool operator==(const CpuPlace &) const { return true; }
+  inline bool operator!=(const CpuPlace &) const { return false; }
 };
 struct GpuPlace {
+  GpuPlace() : GpuPlace(0) {}
  GpuPlace(int d) : device(d) {}
  // needed for variant equality comparison
-  inline bool operator==(const GpuPlace& o) const { return device == o.device; }
+  inline bool operator==(const GpuPlace &o) const { return device == o.device; }
+  inline bool operator!=(const GpuPlace &o) const { return !(*this == o); }
-  inline bool operator!=(const GpuPlace& o) const { return !(*this == o); }
-  GpuPlace() : GpuPlace(0) {}
  int device;
 };
-class IsGpuPlace : public boost::static_visitor<bool> {
+struct IsGpuPlace : public boost::static_visitor<bool> {
-public:
+  bool operator()(const CpuPlace &) const { return false; }
-  bool operator()(const CpuPlace&) const { return false; }
+  bool operator()(const GpuPlace &gpu) const { return true; }
-  bool operator()(const GpuPlace& gpu) const { return true; }
 };
 typedef boost::variant<GpuPlace, CpuPlace> Place;
-void set_place(const Place&);
+void set_place(const Place &);
+const Place &get_place();
-const Place& get_place();
 const GpuPlace default_gpu();
 const CpuPlace default_cpu();
-bool is_gpu_place(const Place&);
+bool is_gpu_place(const Place &);
-bool is_cpu_place(const Place&);
+bool is_cpu_place(const Place &);
-bool places_are_same_class(const Place&, const Place&);
+bool places_are_same_class(const Place &, const Place &);
-std::ostream& operator<<(std::ostream&, const majel::Place&);
+std::ostream &operator<<(std::ostream &, const Place &);
-}  // namespace majel
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/majel/place_test.cc
+++ b/paddle/majel/place_test.cc
-#include "paddle/majel/place.h"
+#include "paddle/platform/place.h"
 #include <sstream>
 #include "gtest/gtest.h"
 TEST(Place, Equality) {
-  majel::CpuPlace cpu;
+  paddle::platform::CpuPlace cpu;
-  majel::GpuPlace g0(0), g1(1), gg0(0);
+  paddle::platform::GpuPlace g0(0), g1(1), gg0(0);
  EXPECT_EQ(cpu, cpu);
  EXPECT_EQ(g0, g0);
@@ -13,28 +13,28 @@ TEST(Place, Equality) {
  EXPECT_NE(g0, g1);
-  EXPECT_TRUE(majel::places_are_same_class(g0, gg0));
+  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
-  EXPECT_FALSE(majel::places_are_same_class(g0, cpu));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
 }
 TEST(Place, Default) {
-  EXPECT_TRUE(majel::is_gpu_place(majel::get_place()));
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
-  EXPECT_TRUE(majel::is_gpu_place(majel::default_gpu()));
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
-  EXPECT_TRUE(majel::is_cpu_place(majel::default_cpu()));
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
-  majel::set_place(majel::CpuPlace());
+  paddle::platform::set_place(paddle::platform::CpuPlace());
-  EXPECT_TRUE(majel::is_cpu_place(majel::get_place()));
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
 }
 TEST(Place, Print) {
  {
    std::stringstream ss;
-    ss << majel::GpuPlace(1);
+    ss << paddle::platform::GpuPlace(1);
    EXPECT_EQ("GpuPlace(1)", ss.str());
  }
  {
    std::stringstream ss;
-    ss << majel::CpuPlace();
+    ss << paddle::platform::CpuPlace();
    EXPECT_EQ("CpuPlace", ss.str());
  }
 }
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -26,6 +26,13 @@ set(TRAINER_HEADERS
        ThreadParameterUpdater.h
        TrainerConfigHelper.h)
+# NewRemoteParameterUpdater is only built when WITH_GOLANG is on; otherwise
+# drop it from both the source list and the header list.
+if(NOT WITH_GOLANG)
+  list(REMOVE_ITEM TRAINER_SOURCES NewRemoteParameterUpdater.cpp)
+  list(REMOVE_ITEM TRAINER_HEADERS NewRemoteParameterUpdater.h)
+endif()
 add_library(paddle_trainer_lib STATIC
    ${TRAINER_SOURCES})
@@ -34,7 +41,7 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp paddle_pserver_cclient_lib)
+    gen_proto_cpp)
 macro(add_paddle_exe TARGET_NAME)
  add_executable(${TARGET_NAME} ${ARGN})
@@ -63,5 +70,8 @@ if(APPLE)
  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
 endif()
-target_link_libraries(paddle_trainer ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
+# Link the pserver client archive from the build tree, and only when
+# WITH_GOLANG is enabled.
+if(WITH_GOLANG)
-target_link_libraries(paddle_trainer_lib ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
+  # add_dependencies only orders the build; the archive is linked below.
+  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
+  target_link_libraries(paddle_trainer_lib ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a)
+endif()
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -124,6 +124,8 @@ TEST(RecurrentGradientMachine, test_generation) {
                     bool beam_search) {
    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
    for (auto useGpu : useGpuConfs) {
+      LOG(INFO) << configFile << " useGpu=" << useGpu
+                << " beam_search=" << beam_search;
      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
    }
  };

--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -5,6 +5,7 @@ set(proto_filenames
    ParameterConfig.proto
    ParameterService.proto
    TrainerConfig.proto
+    OptimizerConfig.proto
    ParameterServerConfig.proto)
 set(PROTO_GEN)
@@ -35,10 +36,8 @@ foreach(filename ${proto_filenames})
        DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
 add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-add_library(paddle_proto STATIC
-    ${PROTO_GEN})
+add_library(paddle_proto STATIC ${PROTO_GEN})
 target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package paddle;
+message SGDConfig {
+  // SGD settings. Every field is optional and carries an explicit default.
+  // momentum: float >= 0. Parameter updates momentum.
+  // decay: float >= 0. Learning rate decay over each update.
+  // nesterov: boolean. Whether to apply Nesterov momentum.
+  optional double momentum = 21 [default = 0.0];
+  optional double decay = 23 [default = 0.0];
+  optional bool nesterov =24 [default = false];
+}
+message AdadeltaConfig {
+  // Adadelta settings.
+  // It is recommended to leave these parameters at their default values.
+  // rho: float >= 0.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+  // reference : [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
+  optional double rho = 33 [default = 0.90];
+  optional double epsilon = 31 [default = 1e-5];
+  optional double decay = 32 [default = 0.0];
+}
+message AdagradConfig {
+  // Adagrad settings.
+  // epsilon: float >= 0.
+  // decay: float >= 0. Learning rate decay over each update.
+  // reference : [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  optional double epsilon = 41 [default = 1e-5];
+  optional double decay = 42 [default = 0.0];
+}
+message AdamConfig {
+  // Adam settings. Unlike the other optimizer configs in this file, these
+  // fields declare no explicit default.
+  // beta_1: float, 0 < beta < 1. Generally close to 1.
+  // beta_2: float, 0 < beta < 1. Generally close to 1.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+  // reference : [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
+  optional double beta_1 = 41;
+  optional double beta_2 = 42;
+  optional double epsilon = 43;
+  optional double decay = 44;
+}
+message ConstLrConfig {
+  // Learning rate policy (constant): a single learning_rate value.
+  optional double learning_rate = 1 [default = 1.0];
+}
+message LinearLrConfig {
+  // Learning rate policy (linear): learning_rate plus lr_decay_a and
+  // lr_decay_b. How the two decay terms are combined is decided by the
+  // optimizer implementation, not by this message.
+  optional double learning_rate = 1 [default = 1.0];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
+}
+message TensorProto {
+// Element types a serialized tensor can be tagged with.
+enum DataType {
+  PADDLE_ELEMENT_TYPE_INT32 = 0;
+  PADDLE_ELEMENT_TYPE_UINT32 = 1;
+  PADDLE_ELEMENT_TYPE_INT64 = 2;
+  PADDLE_ELEMENT_TYPE_UINT64 = 3;
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
+}
+  // Presumably the element type of the data held in `content` -- confirm
+  // against the serializer.
+  optional DataType data_type = 1;
+  // Raw bytes. NOTE(review): the field is repeated, so the payload can span
+  // several byte strings; the chunking convention is not visible here.
+  repeated bytes content = 2;
+}
+message SGDOptimizerState {
+  // SGD optimizer state: learning-rate bookkeeping (fields 101-104) followed
+  // by the tensors listed under "state".
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+}
+message AdadeltaOptimizerState {
+  // Adadelta optimizer state: learning-rate bookkeeping (fields 101-104)
+  // followed by the tensors listed under "state".
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+  optional TensorProto accum_delta = 3;
+  optional TensorProto update_delta = 4;
+}
+message AdagradOptimizerState {
+  // Adagrad optimizer state: learning-rate bookkeeping (fields 101-104)
+  // followed by the tensors listed under "state".
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+}
+message AdamOptimizerState {
+  // Adam optimizer state: learning-rate bookkeeping (fields 101-104)
+  // followed by the tensors listed under "state".
+  // learning rate policy
+  optional double learning_rate = 101;
+  optional double lr_decay_a = 102;
+  optional double lr_decay_b = 103;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+  optional TensorProto velocitys = 3;
+}
+message OptimizerConfig {
+  // Algorithms that can be selected through `optimizer`.
+  enum Optimizer {
+   SGD = 1;
+   Adadelta = 2;
+   Adagrad = 3;
+   Adam = 4;
+  }
+  optional Optimizer optimizer = 1;
+  // One settings message per algorithm.
+  // NOTE(review): presumably only the one matching `optimizer` is read --
+  // confirm against the consumer.
+  optional SGDConfig sgd = 3;
+  optional AdadeltaConfig adadelta = 4;
+  optional AdagradConfig adagrad = 5;
+  optional AdamConfig adam = 6;
+  // Learning rate policies that can be selected through `lr_policy`.
+  enum LrPolicy {
+   Const = 0;
+   Linear = 1;
+  }
+  optional LrPolicy lr_policy = 11;
+  // One settings message per learning rate policy.
+  optional ConstLrConfig const_lr = 12;
+  optional LinearLrConfig linear_lr = 13;
+  // common config of optimizer
+  // gradient clip when L2 exceeding value
+  optional double clip_norm = 101;
+  // gradient clip when L1 exceeding value
+  optional double clip_value = 102;
+}
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -18,7 +18,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} paddle_master_shared)
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
 add_custom_target(paddle_python ALL DEPENDS
    ${OUTPUT_DIR}/.timestamp)

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -328,53 +328,33 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
    SubModelBegin(name)
    g_current_submodel.is_recurrent_layer_group = True
    g_current_submodel.reversed = seq_reversed
-    g_current_submodel.target_inlinkid = -1
    in_links_count = 0
    for linkid, link in enumerate(in_links):
        if isinstance(link, basestring):
            name = link
-            has_subseq = False
        else:
            name = link.link_name
-            has_subseq = link.has_subseq
-        # assign target_inlinkid according to target_inlinkname
-        if target_inlinkname == name:
-            g_current_submodel.target_inlinkid = linkid
-        if in_links_count == 0:
-            in_links_has_subseq = has_subseq
-        else:
-            config_assert(
-                in_links_has_subseq == has_subseq,
-                "The sequence type of in_links should be the same in RecurrentLayerGroup"
-            )
        in_links_count += 1
        layer_name = MakeLayerNameInParentSubmodel(name)
        layer = g_layer_map[layer_name]
-        if has_subseq:
-            SequenceScatterAgentLayer(name=name, size=layer.size)
-        else:
        ScatterAgentLayer(name=name, size=layer.size)
        pair = g_current_submodel.in_links.add()
        pair.layer_name = layer_name
        pair.link_name = MakeLayerNameInSubmodel(name)
-        pair.has_subseq = has_subseq
 @config_func
 def RecurrentLayerGroupSetOutLink(link):
    if isinstance(link, basestring):
        name = link
-        has_subseq = False
    else:
        name = link.link_name
-        has_subseq = link.has_subseq
    layer_name = MakeLayerNameInParentSubmodel(name)
    pair = g_current_submodel.out_links.add()
    pair.layer_name = MakeLayerNameInSubmodel(name)
    pair.link_name = layer_name
-    pair.has_subseq = has_subseq
 def RecurrentLayerGroupSetGenerator(generator=None):
@@ -389,8 +369,7 @@ def RecurrentLayerGroupBegin(name,
                             generator=None,
                             target_inlinkname="",
                             seq_reversed=False):
-    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed,
+    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed)
-                                            target_inlinkname)
    for link in out_links:
        RecurrentLayerGroupSetOutLink(link)
@@ -425,8 +404,6 @@ def RecurrentLayerGroupEnd(name):
        agent_name = GetLayerBaseName(pair.link_name)
        if prev_submodel.HasField("generator"):
            DataLayer(name=agent_name, size=layer.size)
-        elif pair.has_subseq:
-            SequenceGatherAgentLayer(name=agent_name, size=layer.size)
        else:
            GatherAgentLayer(name=agent_name, size=layer.size)
@@ -1651,8 +1628,14 @@ class SelectiveFCLayer(LayerBase):
 @config_layer('print')
 class PrintLayer(LayerBase):
-    def __init__(self, name, inputs):
+    def __init__(self, name, inputs, format=None):
        super(PrintLayer, self).__init__(name, 'print', 0, inputs)
+        # Without an explicit format, build one "layer=<name> %s" line per
+        # input and join the lines with newlines.
+        if format is None:
+            default_lines = []
+            for layer_input in self.inputs:
+                default_lines.append(
+                    "layer=" + layer_input.input_layer_name + " %s")
+            format = "\n".join(default_lines)
+        # The format string is stored in the layer config's user_arg field.
+        self.config.user_arg = format
 @config_layer('priorbox')
@@ -1949,7 +1932,6 @@ class BatchNormLayer(LayerBase):
    def __init__(self,
                 name,
                 inputs,
-                 active_type="linear",
                 bias=True,
                 use_global_stats=True,
                 moving_average_fraction=0.9,
@@ -1987,12 +1969,7 @@ class BatchNormLayer(LayerBase):
            cudnn_version >= 4007
        self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
        super(BatchNormLayer, self).__init__(
-            name,
+            name, self.layer_type, 0, inputs=inputs, **xargs)
-            self.layer_type,
-            0,
-            active_type=active_type,
-            inputs=inputs,
-            **xargs)
        if use_global_stats is not None:
            self.config.use_global_stats = use_global_stats
@@ -2253,13 +2230,6 @@ class AgentLayer(LayerBase):
            name, 'agent', size, inputs=[], device=device)
-@config_layer('sequence_agent')
-class SequenceAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceAgentLayer, self).__init__(
-            name, 'sequence_agent', size, inputs=[], device=device)
 @config_layer('gather_agent')
 class GatherAgentLayer(LayerBase):
    def __init__(self, name, size, device=None):
@@ -2274,20 +2244,6 @@ class ScatterAgentLayer(LayerBase):
            name, 'scatter_agent', size, inputs=[], device=device)
-@config_layer('sequence_gather_agent')
-class SequenceGatherAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceGatherAgentLayer, self).__init__(
-            name, 'sequence_gather_agent', size, inputs=[], device=device)
-@config_layer('sequence_scatter_agent')
-class SequenceScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceScatterAgentLayer, self).__init__(
-            name, 'sequence_scatter_agent', size, inputs=[], device=device)
 @config_layer('multiplex')
 class MultiplexLayer(LayerBase):
    def __init__(self, name, inputs, size, device=None):
@@ -2303,12 +2259,12 @@ class MultiplexLayer(LayerBase):
 @config_func
-def Link(
+def Link(name, has_subseq=False):
-        name,
+    """
-        has_subseq=False, ):
+    Build a LinkConfig whose link_name is `name`.
+
+    has_subseq is ignored; the parameter is kept only so that existing
+    configs that still pass it keep working.
+    """
    link_config = LinkConfig()
    link_config.link_name = name
-    link_config.has_subseq = has_subseq
    return link_config
@@ -2341,12 +2297,6 @@ def Memory(name,
        config_assert(name is not None, "name needs cannot be None")
        memory_name = name + "+delay1"
    agent_name = memory_name
-    if is_sequence:
-        config_assert(
-            boot_layer is not None,
-            "there must be boot_layer in network when is_sequence = True")
-        agent_layer = SequenceAgentLayer(agent_name, size)
-    else:
    agent_layer = AgentLayer(agent_name, size)
    config_assert(g_current_submodel.is_recurrent_layer_group,
                  'Memory should be used in recurrent layer group only')
@@ -2354,7 +2304,6 @@ def Memory(name,
    if name is not None:
        memory.layer_name = MakeLayerNameInSubmodel(name)
    memory.link_name = MakeLayerNameInSubmodel(agent_name)
-    memory.is_sequence = is_sequence
    options = sum((boot_layer is not None, bool(boot_bias),
                   boot_with_const_id is not None))
    config_assert(
@@ -2428,15 +2377,23 @@ class ExpandLayer(LayerBase):
 @config_layer('featmap_expand')
 class FeatMapExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None, num_filters=None, bias=False):
+    # `device` is no longer an explicit parameter; it and any other extra
+    # keyword arguments are forwarded to LayerBase through **xargs.
+    def __init__(self,
+                 name,
+                 inputs,
+                 num_filters=None,
+                 as_row_vector=True,
+                 bias=False,
+                 **xargs):
        super(FeatMapExpandLayer, self).__init__(
-            name, 'featmap_expand', 0, inputs=inputs, device=device)
+            name, 'featmap_expand', 0, inputs=inputs, **xargs)
        config_assert(
            len(self.inputs) == 1, 'ExpandLayer takes 1 and only 1 inputs')
        if num_filters is not None:
            self.config.num_filters = num_filters
        else:
            logger.fatal("FeatMapExpandLayer must specify num_filters.")
+        # Row-vector repetition is the default and leaves user_arg unset; the
+        # column-vector variant is flagged with the "as_col_vec" marker.
+        if not as_row_vector:
+            self.config.user_arg = "as_col_vec"
        self.set_layer_size(self.get_input_layer(0).size * num_filters)
@@ -2446,14 +2403,12 @@ class MaxLayer(LayerBase):
                 name,
                 inputs,
                 trans_type='non-seq',
-                 active_type='linear',
                 bias=False,
                 output_max_index=None,
                 **xargs):
        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
        self.config.trans_type = trans_type
-        self.config.active_type = active_type
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
            self.set_layer_size(input_layer.size)
@@ -2495,18 +2450,12 @@ class SequenceLastInstanceLayer(LayerBase):
    def __init__(self,
                 name,
                 inputs,
-                 active_type='linear',
                 trans_type='non-seq',
                 bias=False,
                 stride=-1,
                 **xargs):
        super(SequenceLastInstanceLayer, self).__init__(
-            name,
+            name, 'seqlastins', 0, inputs=inputs, **xargs)
-            'seqlastins',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
        config_assert(
            len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
        if trans_type == 'seq':
@@ -2522,7 +2471,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
    def __init__(self,
                 name,
                 inputs,
-                 active_type='linear',
                 trans_type='non-seq',
                 bias=False,
                 stride=-1,
@@ -2530,7 +2478,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
        super(SequenceFirstInstanceLayer, self).__init__(
            name,
            inputs=inputs,
-            active_type=active_type,
            trans_type=trans_type,
            bias=bias,
            stride=stride,
@@ -2540,14 +2487,9 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
        super(SequenceConcatLayer, self).__init__(
-            name,
+            name, 'seqconcat', 0, inputs=inputs, **xargs)
-            'seqconcat',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
        config_assert(
            len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
        for input_index in xrange(len(self.inputs)):
@@ -2558,20 +2500,9 @@ class SequenceConcatLayer(LayerBase):
 @config_layer('seqreshape')
 class SequenceReshapeLayer(LayerBase):
-    def __init__(self,
+    def __init__(self, name, size, inputs, bias=False, **xargs):
-                 name,
-                 size,
-                 inputs,
-                 active_type='linear',
-                 bias=False,
-                 **xargs):
        super(SequenceReshapeLayer, self).__init__(
-            name,
+            name, 'seqreshape', size, inputs=inputs, **xargs)
-            'seqreshape',
-            size,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
        config_assert(
            len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
        self.set_layer_size(size)
@@ -2580,9 +2511,9 @@ class SequenceReshapeLayer(LayerBase):
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
        super(SubSequenceLayer, self).__init__(
-            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'subseq', 0, inputs=inputs, **xargs)
        config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
        input_layer0 = self.get_input_layer(0)
        size = input_layer0.size
@@ -2738,11 +2669,10 @@ class AverageLayer(LayerBase):
                 inputs,
                 average_strategy='average',
                 trans_type='non-seq',
-                 active_type='linear',
                 bias=False,
                 **xargs):
        super(AverageLayer, self).__init__(
-            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'average', 0, inputs=inputs, **xargs)
        self.config.average_strategy = average_strategy
        self.config.trans_type = trans_type
        config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -311,18 +311,6 @@ class LayerOutput(object):
        self.outputs = outputs
        self.reverse = reverse
-    def __repr__(self):
-        """
-        Disable __repr__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
-    def __str__(self):
-        """
-        Disable __str__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
    def set_input(self, input):
        """
        Set the input for a memory layer. Can only be used for memory layer
@@ -976,7 +964,7 @@ def fc_layer(input,
 @wrap_name_default("print")
-def printer_layer(input, name=None):
+def printer_layer(input, format=None, name=None):
    """
    Print the output value of input layers. This layer is useful for debugging.
@@ -994,6 +982,7 @@ def printer_layer(input, name=None):
    Layer(
        name=name,
+        format=format,
        type=LayerType.PRINT_LAYER,
        inputs=[l.name for l in input], )
    # this layer don't return anything, can not be input of other layer.
@@ -1565,14 +1554,24 @@ def expand_layer(input,
 @wrap_name_default()
+@wrap_act_default(act=IdentityActivation())
 @layer_support()
-def repeat_layer(input, num_repeats, name=None, layer_attr=None):
+def repeat_layer(input,
+                 num_repeats,
+                 as_row_vector=True,
+                 act=None,
+                 name=None,
+                 layer_attr=None):
    """
-    A layer for repeating the input for num_repeats times. This is equivalent
+    A layer for repeating the input for num_repeats times.
-    to apply concat_layer() with num_repeats same input.
+    If as_row_vector:
    .. math::
-       y  = [x, x, \cdots, x]
+       y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+    If not as_row_vector:
+    .. math::
+       y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
    The example usage is:
@@ -1585,6 +1584,14 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
    :param num_repeats: Repeat the input so many times
    :type num_repeats: int
    :param name: Layer name.
    :type name: basestring
+    :param as_row_vector: True to treat the input as a row vector and repeat
+                          it in the column direction. This is equivalent to
+                          applying concat_layer() to num_repeats copies of
+                          the same input.
+                          False to treat the input as a column vector and
+                          repeat it in the row direction.
+    :type as_row_vector: bool
+    :param act: Activation type.
+    :type act: BaseActivation
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.
@@ -1595,13 +1602,16 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
    l = Layer(
        inputs=[input.name],
        name=name,
+        active_type=act.name,
        num_filters=num_repeats,
+        as_row_vector=as_row_vector,
        type=LayerType.FEATURE_MAP_EXPAND_LAYER,
        **ExtraAttr.to_kwargs(layer_attr))
    return LayerOutput(
        name=name,
        size=l.config.size,
        layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
+        activation=act,
        parents=[input])
@@ -2846,11 +2856,13 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
    Concat sequence a with sequence b.
    Inputs:
-      - a = [a1, a2, ..., an]
+      - a = [a1, a2, ..., am]
      - b = [b1, b2, ..., bn]
-      - Note that the length of a and b should be the same.
-    Output: [a1, b1, a2, b2, ..., an, bn]
+    Output: [a1, ..., am, b1, ..., bn]
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
    The example usage is:
@@ -2944,7 +2956,7 @@ def memory(name,
    :param memory_name: the name of the memory.
                        It is ignored when name is provided.
    :type memory_name: basestring
-    :param is_seq: is sequence for boot_layer
+    :param is_seq: DEPRECATED. Whether the boot_layer is a sequence.
    :type is_seq: bool
    :param boot_layer: boot layer of memory.
    :type boot_layer: LayerOutput|None
@@ -2971,7 +2983,6 @@ def memory(name,
    memory_name = Memory(
        name,
        size,
-        is_sequence=is_seq,
        boot_layer=boot_layer.name if boot_layer is not None else None,
        boot_bias=boot_bias,
        boot_bias_active_type=boot_bias_active_type.name,
@@ -3318,19 +3329,21 @@ class StaticInput(object):
    """
    StaticInput is only used in recurrent_group which defines a read-only memory
    that can be a sequence or non-sequence.
+    :param size: DEPRECATED. If given, it must be equal to input.size.
+    :param is_seq: DEPRECATED. This argument is no longer stored.
    """
    def __init__(self, input, is_seq=False, size=None):
        assert isinstance(input, LayerOutput)
        self.input = input
-        self.is_seq = is_seq
+        assert input.size is not None
-        assert input.size is not None or size is not None
        if size is not None:
-            input.size = size
+            assert input.size == size
-class SubsequenceInput(object):
+def SubsequenceInput(input):
    """
+    DEPRECATED.
    Input sequence has sub-sequence, used in recurrent_group.
    The example usage is:
@@ -3339,11 +3352,7 @@ class SubsequenceInput(object):
       input = SubsequenceInput(layer)
    """
+    return input
-    def __init__(self, input):
-        assert isinstance(input, LayerOutput)
-        assert input.size is not None
-        self.input = input
 @wrap_name_default("recurrent_group")
@@ -3407,7 +3416,8 @@ def recurrent_group(step,
                    input sequence in a reverse order.
    :type reverse: bool
-    :param targetInlink: the input layer which share info with layer group's output
+    :param targetInlink: DEPRECATED.
+                         The input layer which shares info with layer group's output
                         Param input specifies multiple input layers. For
                         SubsequenceInput inputs, config should assign one input
@@ -3429,46 +3439,21 @@ def recurrent_group(step,
    model_type('recurrent_nn')
    def is_single_input(x):
-        return isinstance(x, LayerOutput) or isinstance(x, StaticInput) \
+        return isinstance(x, LayerOutput) or isinstance(x, StaticInput)
-               or isinstance(x, SubsequenceInput)
    if is_single_input(input):
        input = [input]
    assert isinstance(input, collections.Sequence)
    def is_in_links(x):
-        return isinstance(x, LayerOutput) or isinstance(x, SubsequenceInput)
+        return isinstance(x, LayerOutput)
    in_links = filter(is_in_links, input)
-    def targetInlink_in_inlinks():
-        for inlink in in_links:
-            if isinstance(inlink, SubsequenceInput):
-                if targetInlink == inlink.input:
-                    return True
-            elif targetInlink == inlink:
-                return True
-        return False
-    assert (targetInlink == None or targetInlink_in_inlinks())
-    targetInlinkName = None if targetInlink == None \
-        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-        else targetInlink.input.name
-    contains_sub_seq = [False]
-    def map_in_links(x):
-        if isinstance(x, SubsequenceInput):
-            contains_sub_seq[0] = True
-            return Link(name=x.input.name, has_subseq=True)
-        else:
-            return x.name
    RecurrentLayerGroupWithoutOutLinksBegin(
        name=name,
-        in_links=map(map_in_links, in_links),
+        in_links=map(lambda x: x.name, in_links),
-        seq_reversed=reverse,
+        seq_reversed=reverse)
-        target_inlinkname=targetInlinkName)
    in_args = []
    has_LayerOutput = False
    for each_input in input:
@@ -3476,21 +3461,13 @@ def recurrent_group(step,
        if isinstance(each_input, LayerOutput):
            in_args.append(each_input)
            has_LayerOutput = True
-        elif isinstance(each_input, SubsequenceInput):
+        else:  # StaticInput
-            in_args.append(each_input.input)
-            has_LayerOutput = True
-        else:
            mem_name = "__%s_memory__" % each_input.input.name
            mem = memory(
-                name=mem_name,
+                name=None,
-                is_seq=each_input.is_seq,
                size=each_input.input.size,
                boot_layer=each_input.input)
-            with mixed_layer(
+            mem.set_input(mem)
-                    name=mem_name,
-                    size=each_input.input.size,
-                    act=IdentityActivation()) as mix:
-                mix += identity_projection(mem)
            in_args.append(mem)
    assert (is_generating != has_LayerOutput)
@@ -3503,9 +3480,6 @@ def recurrent_group(step,
    for ot in layer_outs:
        assert isinstance(ot, LayerOutput)
        ot.reverse = reverse
-        if contains_sub_seq[0]:
-            RecurrentLayerGroupSetOutLink(Link(ot.name, has_subseq=True))
-        else:
        RecurrentLayerGroupSetOutLink(ot.name)
    RecurrentLayerGroupEnd(name=name)

--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
 #!/bin/bash
-export configs=(test_fc layer_activations projections test_print_layer
+export configs=(test_repeat_layer test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
@@ -9,7 +9,7 @@ layers {
  name: "__first_seq_0__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }
@@ -21,7 +21,7 @@ layers {
  name: "__first_seq_1__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }
@@ -33,7 +33,7 @@ layers {
  name: "__last_seq_0__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }
@@ -44,7 +44,7 @@ layers {
  name: "__last_seq_1__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }
@@ -55,7 +55,7 @@ layers {
  name: "__first_seq_2__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }
@@ -67,7 +67,7 @@ layers {
  name: "__last_seq_2__"
  type: "seqlastins"
  size: 30
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data"
  }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -123,7 +123,7 @@ layers {
  name: "__last_seq_0__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__simple_gru_0__"
  }
@@ -134,7 +134,7 @@ layers {
  name: "__last_seq_1__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__simple_gru_1__"
  }
@@ -256,19 +256,15 @@ sub_models {
  memories {
    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__simple_gru_0___transform"
    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
    link_name: "__simple_gru_0__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__simple_gru_1___recurrent_group"
@@ -280,18 +276,14 @@ sub_models {
  memories {
    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__simple_gru_1___transform"
    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
    link_name: "__simple_gru_1__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -205,7 +205,7 @@ layers {
  name: "__last_seq_0__"
  type: "seqlastins"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__lstm_group_0__"
  }
@@ -216,7 +216,7 @@ layers {
  name: "__last_seq_1__"
  type: "seqlastins"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__lstm_group_1__"
  }
@@ -341,24 +341,19 @@ sub_models {
  memories {
    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
  }
  memories {
    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__mixed_0__"
    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__lstm_group_1___recurrent_group"
@@ -373,23 +368,18 @@ sub_models {
  memories {
    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
    link_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
  }
  memories {
    layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
    link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__mixed_1__"
    link_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
    link_name: "__lstm_group_1__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
@@ -138,7 +138,7 @@ layers {
  name: "__last_seq_0__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__recurrent_layer_0__"
  }
@@ -149,7 +149,7 @@ layers {
  name: "__first_seq_0__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__recurrent_layer_1__"
  }
@@ -161,7 +161,7 @@ layers {
  name: "__last_seq_1__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__lstmemory_0__"
  }
@@ -172,7 +172,7 @@ layers {
  name: "__first_seq_1__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__lstmemory_1__"
  }
@@ -184,7 +184,7 @@ layers {
  name: "__last_seq_2__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__gru_0__"
  }
@@ -195,7 +195,7 @@ layers {
  name: "__first_seq_2__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__gru_1__"
  }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
@@ -12,6 +12,7 @@ layers {
  inputs {
    input_layer_name: "input"
  }
+  user_arg: "layer=input %s"
 }
 input_layer_names: "input"
 output_layer_names: "input"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__repeat_layer_0__"
+  type: "featmap_expand"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+}
+layers {
+  name: "__repeat_layer_1__"
+  type: "featmap_expand"
+  size: 300
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+  user_arg: "as_col_vec"
+}
+input_layer_names: "data"
+output_layer_names: "__repeat_layer_0__"
+output_layer_names: "__repeat_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__repeat_layer_0__"
+  layer_names: "__repeat_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__repeat_layer_0__"
+  output_layer_names: "__repeat_layer_1__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -91,7 +91,7 @@ layers {
  name: "__last_seq_0__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "rnn_forward"
  }
@@ -140,7 +140,7 @@ layers {
  name: "__first_seq_0__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "rnn_back"
  }
@@ -155,7 +155,7 @@ layers {
 }
 layers {
  name: "sub_seq_input@__recurrent_group_2__"
-  type: "sequence_scatter_agent"
+  type: "scatter_agent"
  size: 100
  active_type: ""
 }
@@ -182,7 +182,7 @@ layers {
 }
 layers {
  name: "rnn_subseq_forward"
-  type: "sequence_gather_agent"
+  type: "gather_agent"
  size: 200
  active_type: ""
 }
@@ -190,7 +190,7 @@ layers {
  name: "__last_seq_1__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "rnn_subseq_forward"
  }
@@ -280,7 +280,7 @@ layers {
  name: "__last_seq_2__"
  type: "seqlastins"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__lstm_group_0__"
  }
@@ -329,7 +329,7 @@ layers {
  name: "__last_seq_3__"
  type: "seqlastins"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__gru_group_0__"
  }
@@ -378,7 +378,7 @@ layers {
  name: "__last_seq_4__"
  type: "seqlastins"
  size: 200
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "__fc_layer_0__"
  }
@@ -618,19 +618,15 @@ sub_models {
  memories {
    layer_name: "rnn_forward@__recurrent_group_0__"
    link_name: "rnn_forward+delay1@__recurrent_group_0__"
-    is_sequence: false
  }
  in_links {
    layer_name: "seq_input"
    link_name: "seq_input@__recurrent_group_0__"
-    has_subseq: false
  }
  out_links {
    layer_name: "rnn_forward@__recurrent_group_0__"
    link_name: "rnn_forward"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__recurrent_group_1__"
@@ -642,19 +638,15 @@ sub_models {
  memories {
    layer_name: "rnn_back@__recurrent_group_1__"
    link_name: "rnn_back+delay1@__recurrent_group_1__"
-    is_sequence: false
  }
  in_links {
    layer_name: "seq_input"
    link_name: "seq_input@__recurrent_group_1__"
-    has_subseq: false
  }
  out_links {
    layer_name: "rnn_back@__recurrent_group_1__"
    link_name: "rnn_back"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__recurrent_group_2__"
@@ -666,19 +658,15 @@ sub_models {
  memories {
    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
    link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-    is_sequence: false
  }
  in_links {
    layer_name: "sub_seq_input"
    link_name: "sub_seq_input@__recurrent_group_2__"
-    has_subseq: true
  }
  out_links {
    layer_name: "rnn_subseq_forward@__recurrent_group_2__"
    link_name: "rnn_subseq_forward"
-    has_subseq: true
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__lstm_group_0___recurrent_group"
@@ -693,24 +681,19 @@ sub_models {
  memories {
    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
  }
  memories {
    layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__mixed_0__"
    link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
    link_name: "__lstm_group_0__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__gru_group_0___recurrent_group"
@@ -722,19 +705,15 @@ sub_models {
  memories {
    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
    link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-    is_sequence: false
  }
  in_links {
    layer_name: "__mixed_1__"
    link_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    has_subseq: false
  }
  out_links {
    layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
    link_name: "__gru_group_0__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
 sub_models {
  name: "__recurrent_group_3__"
@@ -746,18 +725,14 @@ sub_models {
  memories {
    layer_name: "__fc_layer_0__@__recurrent_group_3__"
    link_name: "__memory_6__@__recurrent_group_3__"
-    is_sequence: false
  }
  in_links {
    layer_name: "seq_input"
    link_name: "seq_input@__recurrent_group_3__"
-    has_subseq: false
  }
  out_links {
    layer_name: "__fc_layer_0__@__recurrent_group_3__"
    link_name: "__fc_layer_0__"
-    has_subseq: false
  }
-  target_inlinkid: -1
 }
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -27,7 +27,7 @@ layers {
  name: "__seqreshape_0__"
  type: "seqreshape"
  size: 5
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "data1"
  }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
@@ -9,7 +9,7 @@ layers {
  name: "__seq_pooling_0__"
  type: "max"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -19,7 +19,7 @@ layers {
  name: "__seq_pooling_1__"
  type: "max"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -29,7 +29,7 @@ layers {
  name: "__seq_pooling_2__"
  type: "average"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -40,7 +40,7 @@ layers {
  name: "__seq_pooling_3__"
  type: "average"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -51,7 +51,7 @@ layers {
  name: "__seq_pooling_4__"
  type: "average"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -62,7 +62,7 @@ layers {
  name: "__seq_pooling_5__"
  type: "average"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }
@@ -73,7 +73,7 @@ layers {
  name: "__seq_pooling_6__"
  type: "max"
  size: 100
-  active_type: "linear"
+  active_type: ""
  inputs {
    input_layer_name: "dat_in"
  }

--- a/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
+from paddle.trainer_config_helpers import *
+settings(batch_size=1000, learning_rate=1e-5)
+din = data_layer(name='data', size=30)
+outputs(
+    repeat_layer(
+        input=din, num_repeats=10, as_row_vector=True),
+    repeat_layer(
+        input=din, num_repeats=10, act=TanhActivation(), as_row_vector=False))
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -26,7 +26,6 @@ import evaluator
 from . import dataset
 from . import reader
 from . import plot
-from . import master
 import attr
 import op
 import pooling
@@ -57,7 +56,6 @@ __all__ = [
    'plot',
    'evaluator',
    'image',
-    'master',
 ]

--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -15,6 +15,7 @@
 import requests
 import hashlib
 import os
+import errno
 import shutil
 import sys
 import importlib
@@ -27,7 +28,12 @@ __all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 if not os.path.exists(DATA_HOME):
+    try:
        os.makedirs(DATA_HOME)
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+        pass
 def md5file(fname):

--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -260,7 +260,7 @@ def parse_network(output_layers, extra_layers=None):
    else:
        extra_layers = []
-    layer_names = __get_used_layers__(output_layers + extra_layers)
+    layer_names = __get_used_layers__(list(output_layers) + list(extra_layers))
    submodel_names = __get_used_submodels__(layer_names)
    submodel_names.add('root')
    evaluator_names = __get_used_evaluators__(layer_names)

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -8,8 +8,7 @@ packages=['paddle',
          'paddle.v2',
          'paddle.v2.dataset',
          'paddle.v2.reader',
-          'paddle.v2.plot',
+          'paddle.v2.plot']
-          'paddle.v2.master']
 setup_requires=["requests",
                "numpy",
@@ -25,7 +24,6 @@ setup(name='paddle',
      description='Parallel Distributed Deep Learning',
      install_requires=setup_requires,
      packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
      package_dir={
          '': '${CMAKE_CURRENT_SOURCE_DIR}'
      },