diff --git a/Dockerfile b/Dockerfile
index b6f99ca539d077164c71d797a5ccda7b1b5c44ba..39af60966b6cab7d8b9e644f4ea658613f8ba518 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get update && \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
-    clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    net-tools && \
     apt-get clean -y
 
 # Install Go
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 02a5c0b2c9be782c459a255c6ffd6ba6441f2693..48f705818b70c92adef107fd3c973ae1ab3d34bb 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -59,7 +59,7 @@ macro(add_style_check_target TARGET_NAME)
                                 "--filter=${STYLE_FILTER}"
                                 "--write-success=${CUR_GEN}" ${filename}
                     DEPENDS ${filename}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
             endif()
         endforeach()
     endif()
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 92dce20c698acb7257321bf50c569331a13b106b..69f40df51680a104c47d9335c070c570dcaff59a 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -11,23 +11,16 @@ find_path(CUDNN_INCLUDE_DIR cudnn.h
 
 get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
 
-if(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
-    execute_process(
-        COMMAND uname -m COMMAND tr -d '\n'
-        OUTPUT_VARIABLE HOST_ARCH
-        RESULT_VARIABLE UNAME_RESULT)
-    if(${UNAME_RESULT})
-        set(HOST_ARCH "x86_64")
-    endif(${UNAME_RESULT})
-else(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
-    set(HOST_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
-endif(NOT ${CMAKE_HOST_SYSTEM_PROCESSOR})
+set(TARGET_ARCH "x86_64")  # fallback when CMake reports no target processor
+if(NOT "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "")
+    set(TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}")
+endif()
 
 list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
-    ${CUDNN_ROOT}/lib/${HOST_ARCH}-linux-gnu
+    ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index cb67793cf974cb8cdd0779227e8642cf7437f7fb..2341e3785bd8e951e10e3f6bbf8a32f63e4ae44d 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -24,20 +24,25 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
-    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1)
+    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
 
-    IF(ANDROID)
-        # arm_soft_fp_abi branch of OpenBLAS to support softfp
-        #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
-        SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs)
-    ELSEIF(RPI)
-        # use hardfp
-        SET(OPENBLAS_COMMIT "v0.2.19")
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs)
+    IF(CMAKE_CROSSCOMPILING)
+        IF(ANDROID)
+            # arm_soft_fp_abi branch of OpenBLAS to support softfp
+            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
+            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+        ELSEIF(RPI)
+            # use hardfp
+            SET(OPENBLAS_COMMIT "v0.2.19")
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
+        ENDIF()
     ELSE()
         SET(OPENBLAS_COMMIT "v0.2.19")
-        SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 libs NUM_THREADS=64)
+        SET(OPTIONAL_ARGS "")
+        IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
+            SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
+        ENDIF()
     ENDIF()
 
     ExternalProject_Add(
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 052530608e6dd7bf86ae082500ad8c838dddaaea..43cd6b398b1caac55b938d576b96eb0282c00fda 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -182,7 +182,7 @@ function(go_library TARGET_NAME)
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
     ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
   add_library(${TARGET_NAME} STATIC IMPORTED)
   set_property(TARGET ${TARGET_NAME} PROPERTY
@@ -199,7 +199,7 @@ function(go_binary TARGET_NAME)
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
     -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
     ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
   install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)
@@ -213,7 +213,7 @@ function(go_test TARGET_NAME)
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
     -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
     ${go_test_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
   add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 8f0833521ce2a867c85e3056255b108e7bf1999d..59bd8b91255944a8ef702edd389214c17d0cb35d 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -130,7 +130,7 @@ recurrent_group
 ---------------
 ..  autoclass:: paddle.v2.layer.recurrent_group
     :noindex:
-    
+
 lstm_step
 ---------
 ..  autoclass:: paddle.v2.layer.lstm_step
@@ -145,12 +145,12 @@ beam_search
 ------------
 ..  autoclass:: paddle.v2.layer.beam_search
     :noindex:
-    
+
 get_output
 ----------
 ..  autoclass:: paddle.v2.layer.get_output
     :noindex:
-    
+
 Mixed Layer
 ===========
 
@@ -203,7 +203,7 @@ trans_full_matrix_projection
 ----------------------------
 ..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
-    
+
 Aggregate Layers
 ================
 
@@ -449,3 +449,11 @@ dropout
 --------------
 ..  autoclass:: paddle.v2.layer.dropout
     :noindex:
+
+Activation with learnable parameter
+===================================
+
+prelu
+--------
+..  autoclass:: paddle.v2.layer.prelu
+    :noindex:
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index 0cb27f802c40ef123fdc9c6799aad3b2a5f554c0..aa418c657a4ba16cce61c030066f4d3e14e891cc 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -7,4 +7,4 @@
   build_and_install/index_cn.rst
   concepts/use_concepts_cn.rst
 
-- `深度学习入门课程 <http://book.paddlepaddle.org/>`_
+- `深度学习入门课程 <http://book.paddlepaddle.org/index.cn.html>`_
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 9f771e93e8b63eb98e31ec12667bd1aa007af20e..be3253e3d41b99a2b696e2c5ef6463ed49680d69 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -6,4 +6,4 @@ GET STARTED
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.en.html>`_
+- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake
index e73b0c865bcf066302646713fa9311b3e3489235..d38d06de2348821b21109f7dc708314da81111c5 100644
--- a/go/cmake/golang.cmake
+++ b/go/cmake/golang.cmake
@@ -39,7 +39,7 @@ function(GO_LIBRARY NAME BUILD_TYPE)
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
     ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 
   add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
   add_dependencies(${NAME} goGet)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9898dc083ebb1783a0e2ddd12afaa9c3d5a79e98..47ca1833967ee705d6558b1dad06a6335b30f03a 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
+add_subdirectory(strings)
 
 # Do not build go directory until go cmake is working smoothly.
 # if(CMAKE_Go_COMPILER)
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 6d9365af2d14673146d9e427138bf6dd5f5b41b6..5beced3bb5a1050078f88dfd4350a2df71d27f35 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -632,7 +632,7 @@ void Argument::printValueString(std::ostream& stream,
                                 const std::string& prefix) const {
   std::unordered_map<std::string, std::string> out;
   getValueString(&out);
-  for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) {
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
     auto it = out.find(field);
     if (it != out.end()) {
       stream << prefix << field << ":\n" << it->second;
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 8c8ba0a2e51b85bde0544c6780b07130336a6bdd..922f25734dee0a6db7fbcfcef3d29d2bad5b7858 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -383,20 +383,23 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
   setOption(sockfd);
 
   /// Now connect to the server
-  int retry_second = 0;
-  int error = 0;
+  int retry_count = 0;
   do {
-    error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (error == ECONNREFUSED) {
+    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
+      break;
+    }
+    const int err = errno;  // snapshot: LOG/sleep below may overwrite errno
+    if (err == ECONNREFUSED) {
       LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_second++ >= 7) {
+      if (retry_count++ >= 7) {
         LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
       }
       std::this_thread::sleep_for(std::chrono::seconds(1));
     } else {
-      PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
+      PCHECK(false) << "ERROR connecting to " << serverAddr << ":"
+                    << serverPort << ", errno: " << err;
     }
-  } while (error == ECONNREFUSED);
+  } while (true);  // left only via break (connected) or a fatal log above
 
   channel_.reset(new SocketChannel(sockfd, serverAddr));
   tcpRdma_ = F_TCP;
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 9f0f9f2d74db8e0b538adb8263e2844c2cf4b74f..2b48e4dc0f875be9a87797fa14885926999a5010 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -58,7 +58,7 @@ EOF
 make -j `nproc`
 if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     pip uninstall -y py-paddle paddle || true
-    ctest -V
+    ctest --output-on-failure
 fi
 
 
diff --git a/paddle/strings/CMakeLists.txt b/paddle/strings/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e55eecd484c0e218ecd51bbd19b3eb4f6f92a25
--- /dev/null
+++ b/paddle/strings/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(stringpiece SRCS stringpiece.cc)
+cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags)
diff --git a/paddle/strings/stringpiece.cc b/paddle/strings/stringpiece.cc
new file mode 100644
index 0000000000000000000000000000000000000000..415b3558d5dfffde26275bcb16ea3922424ca9f3
--- /dev/null
+++ b/paddle/strings/stringpiece.cc
@@ -0,0 +1,141 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/strings/stringpiece.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <iosfwd>
+#include <stdexcept>
+
+namespace paddle {
+
+StringPiece::StringPiece() : data_(NULL), size_(0) {}
+
+StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) {
+  if (d == NULL && n != 0)
+    throw std::invalid_argument(
+        "StringPiece requires len to be 0 for NULL data");
+}
+
+StringPiece::StringPiece(const char* s) : data_(s) {
+  size_ = (s == NULL) ? 0 : strlen(s);
+}
+
+StringPiece::StringPiece(const std::string& s)
+    : data_(s.data()), size_(s.size()) {}
+
+char StringPiece::operator[](size_t n) const {
+  if (n >= len())
+    throw std::invalid_argument("index out of StringPiece length");
+  return data_[n];
+}
+
+// Three-way bytewise comparison: <0, 0 or >0 as a is <, == or > b.
+int Compare(StringPiece a, StringPiece b) {
+  const size_t min_len = std::min(a.len(), b.len());
+  // memcmp must not be handed a NULL pointer, even when comparing 0 bytes.
+  int r = (min_len == 0) ? 0 : memcmp(a.data(), b.data(), min_len);
+  if (r == 0) {
+    if (a.len() < b.len()) return -1;
+    if (a.len() > b.len()) return 1;
+  }
+  return r;
+}
+
+bool operator==(StringPiece x, StringPiece y) {  // empty: NULL is never read
+  return x.len() == y.len() &&
+         (x.data() == y.data() || std::equal(x.begin(), x.end(), y.begin()));
+}
+
+bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
+
+bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; }
+bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; }
+
+bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; }
+bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; }
+
+bool HasPrefix(StringPiece s, StringPiece x) {  // empty x: NULL never read
+  return s.len() >= x.len() && std::equal(x.begin(), x.end(), s.begin());
+}
+
+bool HasSuffix(StringPiece s, StringPiece x) {  // empty x: NULL never read
+  return s.len() >= x.len() &&
+         std::equal(x.begin(), x.end(), s.end() - x.len());
+}
+
+StringPiece SkipPrefix(StringPiece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than StringPiece length");
+  return StringPiece(s.data() + n, s.len() - n);
+}
+
+StringPiece SkipSuffix(StringPiece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than StringPiece length");
+  return StringPiece(s.data(), s.len() - n);
+}
+
+StringPiece TrimPrefix(StringPiece s, StringPiece x) {
+  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
+}
+
+StringPiece TrimSuffix(StringPiece s, StringPiece x) {
+  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
+}
+
+bool Contains(StringPiece s, StringPiece sub) {
+  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
+}
+
+size_t Index(StringPiece s, StringPiece sub) {  // offset of first match
+  const char* hit = std::search(s.begin(), s.end(), sub.begin(), sub.end());
+  return hit == s.end() ? StringPiece::npos : hit - s.begin();
+}
+
+size_t Find(StringPiece s, char c, size_t pos) {
+  if (pos >= s.len()) return StringPiece::npos;  // nothing left to scan
+  const void* hit = memchr(s.data() + pos, c, s.len() - pos);
+  if (hit == nullptr) {
+    return StringPiece::npos;
+  }
+  return static_cast<const char*>(hit) - s.data();
+}
+
+size_t RFind(StringPiece s, char c, size_t pos) {
+  if (s.len() == 0) return StringPiece::npos;
+  // Count an index down; a pointer stepped below s.data() would be undefined.
+  for (size_t i = std::min(pos, s.len() - 1) + 1; i > 0; i--) {
+    if (s.data()[i - 1] == c) {
+      return i - 1;
+    }
+  }
+  return StringPiece::npos;
+}
+
+StringPiece SubStr(StringPiece s, size_t pos, size_t n) {
+  // Clamp the start to the end of s, then the length to what remains.
+  const size_t start = std::min(pos, s.len());
+  return StringPiece(s.data() + start, std::min(n, s.len() - start));
+}
+
+std::ostream& operator<<(std::ostream& o, StringPiece piece) {
+  return o << piece.ToString();
+}
+
+}  // namespace paddle
diff --git a/paddle/strings/stringpiece.h b/paddle/strings/stringpiece.h
new file mode 100644
index 0000000000000000000000000000000000000000..89aa084a2920c6f372b771e0e407b64728ea69e4
--- /dev/null
+++ b/paddle/strings/stringpiece.h
@@ -0,0 +1,104 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+
+// StringPiece is a non-owning view of a character sequence, such as
+// the contents of a std::string or a C string, giving cheap read-only
+// access much like Go's string type.  Note that StringPiece never
+// mutates the underlying characters, so it is thread-safe as long as
+// the underlying storage doesn't change.  Because StringPiece holds
+// only a pointer and a length and doesn't own/manage the storage, it
+// is cheap to construct StringPieces and pass them around.
+class StringPiece {
+public:
+  static const size_t npos = static_cast<size_t>(-1);
+
+  // We provide non-explicit single-argument constructors so users can
+  // pass in a "const char*" or a "string" wherever a "StringPiece"
+  // is expected.  These constructors ensure that if data_ is NULL,
+  // size_ is 0.
+  StringPiece();
+  StringPiece(const char* d, size_t n);
+  StringPiece(const char* d);
+  StringPiece(const std::string& s);
+
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+
+  char operator[](size_t n) const;
+
+  // StringPiece doesn't own the string, so both iterator and const
+  // iterator are const char* indeed.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+
+  // Return a string that contains the copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+int Compare(StringPiece a, StringPiece b);
+
+bool operator==(StringPiece x, StringPiece y);
+bool operator!=(StringPiece x, StringPiece y);
+bool operator<(StringPiece x, StringPiece y);
+bool operator>(StringPiece x, StringPiece y);
+bool operator<=(StringPiece x, StringPiece y);
+bool operator>=(StringPiece x, StringPiece y);
+
+bool HasPrefix(StringPiece s, StringPiece prefix);
+bool HasSuffix(StringPiece s, StringPiece suffix);
+
+StringPiece SkipPrefix(StringPiece s, size_t n);
+StringPiece SkipSuffix(StringPiece s, size_t n);
+
+// Skip the prefix (or suffix) if it matches with the string.
+StringPiece TrimPrefix(StringPiece s, StringPiece prefix);
+StringPiece TrimSuffix(StringPiece s, StringPiece suffix);
+
+// Returns true if s contains sub.  Any s except for an empty s
+// contains an empty sub.
+bool Contains(StringPiece s, StringPiece sub);
+
+// Return the index of the first occurrence of sub in s, or npos.  If
+// both s and sub are empty, it returns npos; otherwise, if only sub is
+// empty, it returns 0.
+size_t Index(StringPiece s, StringPiece sub);
+
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(StringPiece s, char c, size_t pos);
+
+// Search range is [0..pos] inclusive.  If pos == npos, search everything.
+size_t RFind(StringPiece s, char c, size_t pos);
+
+StringPiece SubStr(StringPiece s, size_t pos, size_t n);
+
+// allow StringPiece to be logged
+std::ostream& operator<<(std::ostream& o, StringPiece piece);
+
+}  // namespace paddle
diff --git a/paddle/strings/stringpiece_test.cc b/paddle/strings/stringpiece_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ba66a04f641c3457efa713383484491a213668f
--- /dev/null
+++ b/paddle/strings/stringpiece_test.cc
@@ -0,0 +1,293 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/strings/stringpiece.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+TEST(StringPiece, Construct) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(NULL, s.data());
+    EXPECT_EQ(0U, s.len());
+  }
+  { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(0U, s.len());
+  }
+  {
+    std::string a;
+    EXPECT_EQ(0U, a.size());
+    paddle::StringPiece s(a);
+    EXPECT_EQ(0U, s.len());
+  }
+}
+
+TEST(StringPiece, CopyAndAssign) {
+  paddle::StringPiece empty;
+  EXPECT_EQ(0U, empty.len());
+
+  paddle::StringPiece a("hello");
+  paddle::StringPiece b = a;
+  EXPECT_EQ(b.len(), strlen("hello"));
+  EXPECT_EQ(a, b);
+
+  std::string storage("hello");
+  paddle::StringPiece c(storage);
+  EXPECT_EQ(a, c);
+  EXPECT_NE(a.data(), c.data());
+}
+
+TEST(StringPiece, Compare) {
+  {
+    paddle::StringPiece a("hello");
+    paddle::StringPiece b("world");
+    EXPECT_TRUE(a != b);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a < b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_FALSE(a > b);
+    EXPECT_FALSE(a >= b);
+    EXPECT_LT(Compare(a, b), 0);
+    EXPECT_GT(Compare(b, a), 0);
+  }
+  {
+    paddle::StringPiece a, b;
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_FALSE(a < b);
+    EXPECT_FALSE(a > b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_TRUE(a >= b);
+    EXPECT_EQ(0, Compare(a, b));
+    EXPECT_EQ(0, Compare(b, a));
+  }
+}
+
+TEST(StringPiece, ToString) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s("hello");
+    EXPECT_EQ(std::string("hello"), s.ToString());
+  }
+}
+
+TEST(StringPiece, HasPrefixSuffix) {
+  using paddle::HasPrefix;
+  using paddle::HasSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(HasPrefix(s, "something"));
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_FALSE(HasSuffix(s, "something"));
+    EXPECT_TRUE(HasSuffix(s, ""));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_TRUE(HasPrefix(s, "a"));
+    EXPECT_TRUE(HasPrefix(s, "ap"));
+    EXPECT_TRUE(HasPrefix(s, "app"));
+
+    EXPECT_TRUE(HasSuffix(s, ""));
+    EXPECT_TRUE(HasSuffix(s, "p"));
+    EXPECT_TRUE(HasSuffix(s, "pp"));
+    EXPECT_TRUE(HasSuffix(s, "app"));
+  }
+}
+
+TEST(StringPiece, SkipPrefixSuffix) {
+  using paddle::SkipPrefix;
+  using paddle::SkipSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SkipPrefix(s, 0));
+    EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
+
+    EXPECT_EQ("", SkipSuffix(s, 0));
+    EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", SkipPrefix(s, 0));
+    EXPECT_EQ("pp", SkipPrefix(s, 1));
+    EXPECT_EQ("p", SkipPrefix(s, 2));
+    EXPECT_EQ("", SkipPrefix(s, 3));
+    EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument);
+
+    EXPECT_EQ("app", SkipSuffix(s, 0));
+    EXPECT_EQ("ap", SkipSuffix(s, 1));
+    EXPECT_EQ("a", SkipSuffix(s, 2));
+    EXPECT_EQ("", SkipSuffix(s, 3));
+    EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument);
+  }
+}
+
+TEST(StringPiece, TrimPrefixSuffix) {
+  using paddle::TrimPrefix;
+  using paddle::TrimSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", TrimPrefix(s, ""));
+    EXPECT_EQ("", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("", TrimSuffix(s, ""));
+    EXPECT_EQ("", TrimSuffix(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", TrimPrefix(s, ""));
+    EXPECT_EQ("pp", TrimPrefix(s, "a"));
+    EXPECT_EQ("p", TrimPrefix(s, "ap"));
+    EXPECT_EQ("", TrimPrefix(s, "app"));
+    EXPECT_EQ("app", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("app", TrimSuffix(s, ""));
+    EXPECT_EQ("ap", TrimSuffix(s, "p"));
+    EXPECT_EQ("a", TrimSuffix(s, "pp"));
+    EXPECT_EQ("", TrimSuffix(s, "app"));
+    EXPECT_EQ("app", TrimSuffix(s, "something"));
+  }
+}
+
+TEST(StringPiece, Contains) {
+  using paddle::Contains;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(Contains(s, ""));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(Contains(s, ""));
+    EXPECT_TRUE(Contains(s, "a"));
+    EXPECT_TRUE(Contains(s, "p"));
+    EXPECT_TRUE(Contains(s, "ap"));
+    EXPECT_TRUE(Contains(s, "pp"));
+    EXPECT_TRUE(Contains(s, "app"));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+}
+
+TEST(StringPiece, Index) {
+  using paddle::Index;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Index(s, ""));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Index(s, ""));
+    EXPECT_EQ(0U, Index(s, "a"));
+    EXPECT_EQ(1U, Index(s, "p"));
+    EXPECT_EQ(0U, Index(s, "ap"));
+    EXPECT_EQ(1U, Index(s, "pp"));
+    EXPECT_EQ(0U, Index(s, "app"));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+}
+
+TEST(StringPiece, Find) {
+  using paddle::Find;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Find(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Find(s, 'a', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 1U));
+    EXPECT_EQ(2U, Find(s, 'p', 2U));
+    EXPECT_EQ(npos, Find(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, RFind) {
+  using paddle::RFind;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, RFind(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(2U, RFind(s, 'p', 2U));
+    EXPECT_EQ(0U, RFind(s, 'a', 2U));
+    EXPECT_EQ(1U, RFind(s, 'p', 1U));
+    EXPECT_EQ(0U, RFind(s, 'a', 0));
+    EXPECT_EQ(npos, RFind(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, SubStr) {
+  using paddle::SubStr;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 0, 1));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+    EXPECT_EQ("", SubStr(s, 2, 0));
+    EXPECT_EQ("", SubStr(s, 3, 0));
+
+    EXPECT_EQ("a", SubStr(s, 0, 1));
+    EXPECT_EQ("p", SubStr(s, 1, 1));
+    EXPECT_EQ("p", SubStr(s, 2, 1));
+    EXPECT_EQ("", SubStr(s, 3, 1));
+
+    EXPECT_EQ("ap", SubStr(s, 0, 2));
+    EXPECT_EQ("pp", SubStr(s, 1, 2));
+    EXPECT_EQ("p", SubStr(s, 2, 2));
+    EXPECT_EQ("", SubStr(s, 3, 2));
+
+    EXPECT_EQ("app", SubStr(s, 0, 3));
+    EXPECT_EQ("pp", SubStr(s, 1, 3));
+    EXPECT_EQ("p", SubStr(s, 2, 3));
+    EXPECT_EQ("", SubStr(s, 3, 3));
+  }
+}
+
+TEST(StringPiece, StreamOutput) {
+  using paddle::StringPiece;
+
+  std::stringstream o;
+  o << StringPiece();
+  EXPECT_EQ("", o.str());
+
+  o << StringPiece("hello");
+  EXPECT_EQ("hello", o.str());
+
+  o << StringPiece();
+  EXPECT_EQ("hello", o.str());
+}
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 3be972fd39c9803af029a2c7919de27d23e0972e..c5ec5349cf0c097d7bdbb50bd6d6d3568ff32400 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -73,7 +73,6 @@ To use this from paddle_trainer, paddle_trainer should be called with
 --config_args=extension_module_name=[MODULE_NAME]
 
 '''
-
 import copy
 import logging
 import os
@@ -1731,9 +1730,10 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
-        config_assert(len(self.inputs) == 1)
-        config_assert(self.input_layer.size % partial_sum == 0)
+        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
         input_layer = self.get_input_layer(0)
+        config_assert(input_layer.size % partial_sum == 0,
+                      "a wrong setting for partial_sum")
         self.set_layer_size(input_layer.size)
         self.create_input_parameter(0, input_layer.size / partial_sum)
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 7b2408e43260df942d197445fdc9d8397083b29e..5320f5c32ce00f4780cea16abaee718c95707467 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -31,31 +31,31 @@ except ImportError:
 import copy
 
 __all__ = [
-    "full_matrix_projection",
-    "AggregateLevel",
-    "ExpandLevel",
-    "identity_projection",
-    "dotmul_projection",
-    "dotmul_operator",
-    "repeat_layer",
-    "seq_reshape_layer",
-    "table_projection",
-    "mixed_layer",
-    "data_layer",
-    "embedding_layer",
-    "fc_layer",
-    "grumemory",
-    "pooling_layer",
-    "lstmemory",
-    "last_seq",
-    "first_seq",
-    "cos_sim",
-    "hsigmoid",
-    "conv_projection",
-    "mse_cost",
-    "regression_cost",
+    'full_matrix_projection',
+    'AggregateLevel',
+    'ExpandLevel',
+    'identity_projection',
+    'dotmul_projection',
+    'dotmul_operator',
+    'repeat_layer',
+    'seq_reshape_layer',
+    'table_projection',
+    'mixed_layer',
+    'data_layer',
+    'embedding_layer',
+    'fc_layer',
+    'grumemory',
+    'pooling_layer',
+    'lstmemory',
+    'last_seq',
+    'first_seq',
+    'cos_sim',
+    'hsigmoid',
+    'conv_projection',
+    'mse_cost',
+    'regression_cost',
     'classification_cost',
-    "LayerOutput",
+    'LayerOutput',
     'img_conv_layer',
     'img_pool_layer',
     'batch_norm_layer',
@@ -111,6 +111,7 @@ __all__ = [
     'block_expand_layer',
     'maxout_layer',
     'out_prod_layer',
+    'printer_layer',
     'print_layer',
     'priorbox_layer',
     'cross_channel_norm_layer',
@@ -121,6 +122,7 @@ __all__ = [
     'layer_support',
     'multiplex_layer',
     'dropout_layer',
+    'prelu_layer',
 ]
 
 
@@ -129,26 +131,26 @@ class LayerType(object):
     Layer type enumerations.
     """
 
-    DATA = "data"
-    MIXED_LAYER = "mixed"
-    LSTMEMORY = "lstmemory"
-    GRUMEMORY = "gated_recurrent"
-    SEQUENCE_LAST_INSTANCE = "seqlastins"
-    SEQUENCE_FIRST_INSTANCE = "seqfirstins"
-    SEQUENCE_RESHAPE = "seqreshape"
-    POOLING_MAX = "max"
+    DATA = 'data'
+    MIXED_LAYER = 'mixed'
+    LSTMEMORY = 'lstmemory'
+    GRUMEMORY = 'gated_recurrent'
+    SEQUENCE_LAST_INSTANCE = 'seqlastins'
+    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
+    SEQUENCE_RESHAPE = 'seqreshape'
+    POOLING_MAX = 'max'
     POOLING_AVG = 'average'
-    FC_LAYER = "fc"
+    FC_LAYER = 'fc'
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
     COSINE_SIM = 'cos'
     HSIGMOID = 'hsigmoid'
-    CONV_LAYER = "conv"
-    CONVTRANS_LAYER = "convt"
-    EXCONV_LAYER = "exconv"
-    EXCONVTRANS_LAYER = "exconvt"
-    CUDNNCONV_LAYER = "cudnn_conv"
-    POOL_LAYER = "pool"
+    CONV_LAYER = 'conv'
+    CONVTRANS_LAYER = 'convt'
+    EXCONV_LAYER = 'exconv'
+    EXCONVTRANS_LAYER = 'exconvt'
+    CUDNNCONV_LAYER = 'cudnn_conv'
+    POOL_LAYER = 'pool'
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
@@ -177,36 +179,38 @@ class LayerType(object):
     EOSID_LAYER = 'eos_id'
     RECURRENT_LAYER = 'recurrent'
 
-    CONV_SHIFT_LAYER = "conv_shift"
-    TENSOR_LAYER = "tensor"
-    SEL_FC_LAYER = "selective_fc"
-    SAMPLING_ID_LAYER = "sampling_id"
-    SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    LINEAR_COMBINATION_LAYER = "convex_comb"
-    BLOCK_EXPAND = "blockexpand"
-    MAXOUT = "maxout"
-    SPP_LAYER = "spp"
-    PAD_LAYER = "pad"
-    MULTIPLEX_LAYER = "multiplex"
-
-    PRINT_LAYER = "print"
-    PRIORBOX_LAYER = "priorbox"
-
-    CTC_LAYER = "ctc"
-    WARP_CTC_LAYER = "warp_ctc"
-    CRF_LAYER = "crf"
-    CRF_DECODING_LAYER = "crf_decoding"
+    CONV_SHIFT_LAYER = 'conv_shift'
+    TENSOR_LAYER = 'tensor'
+    SEL_FC_LAYER = 'selective_fc'
+    SAMPLING_ID_LAYER = 'sampling_id'
+    SLOPE_INTERCEPT_LAYER = 'slope_intercept'
+    LINEAR_COMBINATION_LAYER = 'convex_comb'
+    BLOCK_EXPAND = 'blockexpand'
+    MAXOUT = 'maxout'
+    SPP_LAYER = 'spp'
+    PAD_LAYER = 'pad'
+    MULTIPLEX_LAYER = 'multiplex'
+
+    PRINT_LAYER = 'print'
+    PRIORBOX_LAYER = 'priorbox'
+
+    CTC_LAYER = 'ctc'
+    WARP_CTC_LAYER = 'warp_ctc'
+    CRF_LAYER = 'crf'
+    CRF_DECODING_LAYER = 'crf_decoding'
     NCE_LAYER = 'nce'
 
-    RANK_COST = "rank-cost"
-    LAMBDA_COST = "lambda_cost"
-    HUBER = "huber"
-    CROSS_ENTROPY = "multi-class-cross-entropy"
-    CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm"
-    SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
-    MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
-    SUM_COST = "sum_cost"
-    SMOOTH_L1 = "smooth_l1"
+    RANK_COST = 'rank-cost'
+    LAMBDA_COST = 'lambda_cost'
+    HUBER = 'huber'
+    CROSS_ENTROPY = 'multi-class-cross-entropy'
+    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
+    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
+    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
+    SUM_COST = 'sum_cost'
+    SMOOTH_L1 = 'smooth_l1'
+
+    PRELU = 'prelu'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -970,7 +974,7 @@ def fc_layer(input,
 
 
 @wrap_name_default("print")
-def print_layer(input, name=None):
+def printer_layer(input, name=None):
     """
     Print the output value of input layers. This layer is useful for debugging.
 
@@ -992,6 +996,13 @@ def print_layer(input, name=None):
         inputs=[l.name for l in input], )
     # this layer don't return anything, can not be input of other layer.
 
+# Keep print_layer for compatibility with V1 API.
+# 'print_layer' does not work for V2 API because it will be changed to
+# 'print' for V2 API. But 'print' is a reserved key word in python.
+
+
+print_layer = printer_layer
+
 
 @wrap_name_default("priorbox")
 def priorbox_layer(input,
@@ -3851,7 +3862,6 @@ def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
-                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3866,8 +3876,6 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
-    :param top_k: number k in top-k error rate
-    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3895,7 +3903,7 @@ def classification_cost(input,
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
+        e(name=e.__name__, input=input, label=label, weight=weight)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -4716,7 +4724,7 @@ def ctc_layer(input,
         fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
         should also be num_classes + 1.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4803,7 +4811,7 @@ def warp_ctc_layer(input,
         - As a native 'softmax' activation is interated to the warp-ctc library,
           'linear' activation is expected instead in the 'input' layer.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4864,7 +4872,7 @@ def crf_layer(input,
     A layer for calculating the cost of sequential conditional random
     field model.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4938,7 +4946,7 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5131,7 +5139,7 @@ def rank_cost(left,
       - :math:`o_i` and :math:`o_j`: the left output and right output.
         Their dimension is one.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5188,7 +5196,7 @@ def lambda_cost(input,
     """
     lambdaCost for lambdaRank LTR approach.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5246,6 +5254,8 @@ def cross_entropy(input,
     """
     A loss layer for multi class entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy(input=input_layer,
@@ -5292,6 +5302,8 @@ def cross_entropy_with_selfnorm(input,
     A loss layer for multi class entropy with selfnorm.
     Input should be a vector of positive numbers, without normalization.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy_with_selfnorm(input=input_layer,
@@ -5333,6 +5345,8 @@ def sum_cost(input, name=None, layer_attr=None):
     """
     A loss layer which calculate the sum of the input as loss
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = sum_cost(input=input_layer)
@@ -5362,6 +5376,8 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     A loss layer for huber loss.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = huber_cost(input=input_layer,
@@ -5402,6 +5418,8 @@ def multi_binary_label_cross_entropy(input,
     """
     A loss layer for multi binary label cross entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = multi_binary_label_cross_entropy(input=input_layer,
@@ -5461,6 +5479,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     More details can be found by referring to `Fast R-CNN
     <https://arxiv.org/pdf/1504.08083v2.pdf>`_
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = smooth_l1_cost(input=input_layer,
@@ -5510,6 +5530,8 @@ def multiplex_layer(input, name=None, layer_attr=None):
     where, y is output. :math:`x_{k}` is the k-th input layer and
     :math:`k = x_{0}[i] + 1`.
 
+    The example usage is:
+
     .. code-block:: python
 
        maxid = multiplex_layer(input=layers)
@@ -5544,11 +5566,6 @@ def multiplex_layer(input, name=None, layer_attr=None):
         size=l.config.size)
 
 
-############################################################################
-#                         Miscs                                            #
-############################################################################
-
-
 @wrap_name_default("dropout")
 def dropout_layer(input, dropout_rate, name=None):
     """
@@ -5565,3 +5582,64 @@ def dropout_layer(input, dropout_rate, name=None):
         act=LinearActivation(),
         bias_attr=False,
         layer_attr=ExtraAttr(drop_rate=dropout_rate))
+
+
+@wrap_name_default()
+@layer_support()
+@wrap_param_attr_default()
+def prelu_layer(input,
+                name=None,
+                partial_sum=1,
+                param_attr=None,
+                layer_attr=None):
+    """
+    The Parametric ReLU (PReLU) activation, which activates outputs with a
+    learnable weight.
+
+    Reference:
+        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+
+    .. math::
+       z_i &\\quad if \\quad z_i > 0 \\\\
+       a_i * z_i  &\\quad \\mathrm{otherwise}
+
+    The example usage is:
+
+    .. code-block:: python
+
+       prelu = prelu_layer(input=layers, partial_sum=1)
+
+    :param name: Name of this layer.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param partial_sum: this parameter makes a group of inputs share a same weight.
+
+        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
+        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share a same weight.
+        - partial_sum = number of outputs, indicates all elements share a same weight.
+
+    :type partial_sum: int
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute|None
+    :param layer_attr: Extra layer configurations. Default is None.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(param_attr, ParameterAttribute)
+
+    l = Layer(
+        name=name,
+        type=LayerType.PRELU,
+        inputs=Input(input.name, **param_attr.attr),
+        partial_sum=partial_sum,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.PRELU,
+        parents=input,
+        size=l.config.size)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 981ccbf248391b5db4339570d918404df6033f3d..bef14bffaf648b92e608a6a18cd46d57e850550e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,7 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
+test_prelu_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..64d227565f2b21ff43d4391c682ca90c0f47908e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -0,0 +1,36 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__prelu_layer_0__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_0__.w0"
+  }
+}
+parameters {
+  name: "___prelu_layer_0__.w0"
+  size: 300
+  initial_mean: 0.0
+  initial_std: 0.057735026919
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "input"
+output_layer_names: "__prelu_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__prelu_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__prelu_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e3057f323db22ffc3911cce30ec2e8bb95e3dbe
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+prelu = prelu_layer(input=data)
+
+outputs(prelu)
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
index 6ce32f7811d6be6864a567cf41bf408f422409a7..e8db525ff5c388aef1a39d8db56633d509cb4fb9 100644
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
@@ -12,7 +12,7 @@ from paddle.trainer.config_parser import logger
 try:
     import cv2
 except ImportError:
-    logger.warning("OpenCV2 is not installed, using PIL to prcoess")
+    logger.warning("OpenCV2 is not installed, using PIL to process")
     cv2 = None
 
 __all__ = ["CvTransformer", "PILTransformer", "MultiProcessImageTransformer"]
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
new file mode 100644
index 0000000000000000000000000000000000000000..07c13cf719ae0c864c23fef51f0bd7d47f265759
--- /dev/null
+++ b/python/paddle/v2/dataset/flowers.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html 
+and parse train/test set intopaddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories. 
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+ number of classes.Proceedings of the Indian Conference on Computer Vision, 
+Graphics and Image Processing (2008) 
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.v2.image import *
+import os
+import numpy as np
+import paddle.v2 as paddle
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+
+
+def default_mapper(sample):
+    '''
+    decode an (image bytes, label) sample into the model input format
+    '''
+    raw_bytes, label = sample
+    decoded = paddle.image.load_image_bytes(raw_bytes)
+    transformed = paddle.image.simple_transform(decoded, 256, 224, True)
+    return transformed.flatten().astype('float32'), label
+
+
+def reader_creator(data_file,
+                   label_file,
+                   setid_file,
+                   dataset_name,
+                   mapper=default_mapper,
+                   buffered_size=1024):
+    '''
+    1. read images from tar file and
+        merge images into batch files in 102flowers.tgz_batch/
+    2. get a reader to read sample from batch file
+
+    :param data_file: downloaded data file
+    :type data_file: string
+    :param label_file: downloaded label file
+    :type label_file: string
+    :param setid_file: downloaded setid file containing information
+                        about how to split dataset
+    :type setid_file: string
+    :param dataset_name: data set name (tstid|trnid|valid)
+    :type dataset_name: string
+    :param mapper: a function to map image bytes data to type
+                    needed by model input layer
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: data reader
+    :rtype: callable
+    '''
+    labels = scio.loadmat(label_file)['labels'][0]
+    indexes = scio.loadmat(setid_file)[dataset_name][0]
+    img2label = {}
+    for i in indexes:
+        img = "jpg/image_%05d.jpg" % i
+        img2label[img] = labels[i - 1]
+    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+    def reader():
+        with open(file_list) as batch_list:
+            batch_paths = [line.strip() for line in batch_list]
+        for batch_path in batch_paths:
+            # Batches are written with a binary pickle protocol, so they
+            # have to be read back in binary mode.
+            with open(batch_path, 'rb') as f:
+                batch = cPickle.load(f)
+            for sample, label in itertools.izip(batch['data'], batch['label']):
+                yield sample, int(label)
+
+    return paddle.reader.xmap_readers(mapper, reader,
+                                      cpu_count(), buffered_size)
+
+
+def train(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers training set reader.
+    It returns a reader that yields (image, label) samples of the 'trnid'
+    split; label is an int (presumably in [1, 102] -- TODO confirm).
+    With the default mapper each image is produced by these steps:
+    1. decode the bytes, then simple_transform(img, 256, 224, True)
+    2. flatten
+    3. cast to float32
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: train data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
+        buffered_size)
+
+
+def test(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers test set reader.
+    It returns a reader that yields (image, label) samples of the 'tstid'
+    split; label is an int (presumably in [1, 102] -- TODO confirm).
+    With the default mapper each image is produced by these steps:
+    1. decode the bytes, then simple_transform(img, 256, 224, True)
+    2. flatten
+    3. cast to float32
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
+        buffered_size)
+
+
+def valid(mapper=default_mapper, buffered_size=1024):
+    '''
+    Create flowers validation set reader.
+    It returns a reader that yields (image, label) samples of the 'valid'
+    split; label is an int (presumably in [1, 102] -- TODO confirm).
+    With the default mapper each image is produced by these steps:
+    1. decode the bytes, then simple_transform(img, 256, 224, True)
+    2. flatten
+    3. cast to float32
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: validation data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
+        buffered_size)
+
+
+def fetch():
+    download(DATA_URL, 'flowers', DATA_MD5)
+    download(LABEL_URL, 'flowers', LABEL_MD5)
+    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc0626f4feae287d18dfb227cc69a4174da055da
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.flowers
+import unittest
+
+
+class TestFlowers(unittest.TestCase):
+    def check_reader(self, reader):
+        # Returns (number of samples, largest label seen) for the reader.
+        count = 0
+        max_label = 0
+        expected_size = 224 * 224 * 3
+        for sample in reader():
+            self.assertEqual(sample[0].size, expected_size)
+            max_label = max(max_label, sample[1])
+            count += 1
+        return count, max_label
+
+    def test_train(self):
+        count, max_label = self.check_reader(
+            paddle.v2.dataset.flowers.train())
+        self.assertEqual(count, 1020)
+        self.assertEqual(max_label, 102)
+
+    def test_test(self):
+        count, max_label = self.check_reader(
+            paddle.v2.dataset.flowers.test())
+        self.assertEqual(count, 6149)
+        self.assertEqual(max_label, 102)
+
+    def test_valid(self):
+        count, max_label = self.check_reader(
+            paddle.v2.dataset.flowers.valid())
+        self.assertEqual(count, 1020)
+        self.assertEqual(max_label, 102)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 85ad6984ba08440d8f8c24a6ca5842024dbafe4b..0d648e9ae697ff0373c6cdc166608d395a8d8086 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,14 +1,16 @@
 import numpy as np
 try:
     import cv2
-except:
-    print(
-        "import cv2 error, please install opencv-python: pip install opencv-python"
-    )
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
 
 __all__ = [
-    "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
-    "left_right_flip", "simple_transform", "load_and_transform"
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
 ]
 """
 This file contains some common interfaces for image preprocess.
@@ -28,6 +30,90 @@ the image layout as follows.
 """
 
 
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from tar file and batch them into batch file.
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: name of the data subset; used as the name of the
+                    output sub-directory and of the list file
+    :type dataset_name: string
+    :param img2label: a dict with image file name as key
+                    and image's label as value
+    :type img2label: dict
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                # Pickle output is binary: open in 'wb' mode, and use
+                # `with` so the handle is closed deterministically.
+                with open('%s/batch_%d' % (out_path, file_id), 'wb') as f:
+                    cPickle.dump(output, f, protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        # write the last, partially filled batch
+        with open('%s/batch_%d' % (out_path, file_id), 'wb') as f:
+            cPickle.dump(output, f, protocol=cPickle.HIGHEST_PROTOCOL)
+    tf.close()
+
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+    """
+    Load a color or gray image from a bytes array.
+
+    Example usage:
+
+    .. code-block:: python
+
+        with open('cat.jpg', 'rb') as f:
+            im = load_image_bytes(f.read())
+
+    :param bytes: the input image bytes array.
+    :type bytes: str
+    :param is_color: If True, decode and return a color image.
+                     Otherwise, decode and return a gray image.
+    """
+    flag = 1 if is_color else 0
+    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+    img = cv2.imdecode(file_bytes, flag)
+    return img
+
+
 def load_image(file, is_color=True):
     """
     Load an color or gray image from the file path.
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 1a0e64ea7701638b83c3a02637e5d0315cf95f11..aeed9ebd7d4d64efa5d0bf1638742a485c0fa44a 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -156,6 +156,20 @@ def __get_used_layers__(output_layers):
     for layer in output_layers:
         dfs_travel(layer.full_name)
 
+    # print layer needs to be specially handled because no other
+    # layer depends on it. It is used to print the result of some
+    # layers when running the model for debug purpose. So we explicitly
+    # add a print layer to the topolty if its input is in the toplogy.
+    for layer in cp.g_config.model_config.layers:
+        if layer.type == 'print':
+            used = True
+            for inp in layer.inputs:
+                if inp.input_layer_name not in layer_names:
+                    used = False
+                    break
+            if used:
+                layer_names.add(layer.name)
+
     return layer_names
 
 
@@ -266,6 +280,14 @@ def parse_network(output_layers, extra_layers=None):
         model_config.layers.extend([l])
         if l.type == 'data':
             if l.name in model_config.output_layer_names:
+                """
+                In text generation, the outlink to save the generated word
+                indices is a data_layer defined in recurrent_group. This
+                data_layer is sure to be the output of the network in text
+                generation task, so this statement excludes such a special
+                data_layer from being inputs of the network, otherwise an error
+                will occur during data feeding.
+                """
                 continue
             model_config.input_layer_names.append(l.name)
             input_layer_names.add(l.name)
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index 104ce9a0411413bb8fc65eedf5821f98d6acdba3..c76faa596c9fb9079cab3456b721c18ef9768e95 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,7 +14,7 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn'
+    'ComposeNotAligned', 'firstn', 'xmap_readers'
 ]
 
 import itertools
@@ -224,3 +224,74 @@ def firstn(reader, n):
             yield item
 
     return firstn_reader
+
+
+class XmapEndSignal():
+    """Sentinel passed through the xmap_readers queues to mark end of data."""
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size):
+    """
+    Use several worker threads to map samples from reader by a mapper defined
+    by user. Output is buffered and its order may differ from reader's.
+
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param reader: the data reader to read from
+    :type reader: callable
+    :param process_num: number of worker threads that apply mapper
+    :type process_num: int
+    :param buffer_size: max buffer size
+    :type buffer_size: int
+    :return: the decorated reader
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        # pass the end signal on to the other workers, then report completion
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        # fresh queues/threads per call, so the reader can be iterated again
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        # start a read worker in a thread
+        t = Thread(target=read_worker, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+
+        # start several handle_workers
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(
+                target=handle_worker, args=(in_queue, out_queue, mapper))
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        # each worker emits exactly one end signal; stop after all of them
+        finish = 0
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index 2d25b1a9dcaa12a7dfffe962ffab34edc0a95f1a..f2097e195f41637977e71f65f36dad005d3e7941 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -164,6 +164,7 @@ class OtherLayerTest(unittest.TestCase):
         maxid = layer.max_id(input=inference)
         sampling_id = layer.sampling_id(input=inference)
         eos = layer.eos(input=maxid, eos_id=5)
+        layer.printer(maxid)
         print layer.parse_network([maxid, sampling_id, eos])
 
     def test_slicing_joining_layer(self):
diff --git a/python/setup.py.in b/python/setup.py.in
index d1c38823080fb3a5c879d8b59cb5371c07902e57..93724f918801ea706517a1df158ceb78a1c2335c 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -11,17 +11,19 @@ packages=['paddle',
           'paddle.v2.reader',
           'paddle.v2.plot']
 
+setup_requires=["requests",
+                "numpy",
+                "protobuf==3.1",
+                "matplotlib",
+                "rarfile"]
+
+if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
+    setup_requires+=["opencv-python"]
+
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
-      install_requires=[
-          "requests",
-          "numpy",
-          "protobuf==${PROTOBUF_VERSION}",
-          "matplotlib",
-          "opencv-python",
-          "rarfile"
-      ],
+      install_requires=setup_requires,
       packages=packages,
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'