Merge branch 'develop' into enable_error_clipping_for_fc

dc606711 · caoying03 · 20336428 · 77604bca · dc606711 · dc606711
41 changed file
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,3 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
+-   repo: https://github.com/dnephin/pre-commit-golang
+    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    hooks:
+      -   id: go-fmt
+          files: (.*\.go)
+      -   id: go-lint
+          files: (.*\.go)
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,16 +33,17 @@ addons:
      - ccache
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
+    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
  email:

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")
+if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
+    function(protobuf_generate_python SRCS)
+        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+        if(NOT ARGN)
+            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+            return()
+        endif()
+        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            # Create an include path for each file specified
+            foreach(FIL ${ARGN})
+                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        else()
+            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
+        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+        endif()
+        if(DEFINED Protobuf_IMPORT_DIRS)
+            foreach(DIR ${Protobuf_IMPORT_DIRS})
+                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        endif()
+        set(${SRCS})
+        foreach(FIL ${ARGN})
+            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+            get_filename_component(FIL_WE ${FIL} NAME_WE)
+            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+                if(FIL_DIR)
+                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+                endif()
+            endif()
+            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+            add_custom_command(
+                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
+                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                    VERBATIM )
+        endforeach()
+        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+    endfunction()
+endif()
 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -194,7 +194,7 @@ function(cc_test TARGET_NAME)
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_test(${TARGET_NAME} ${TARGET_NAME})
+    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)
@@ -281,10 +281,11 @@ function(go_library TARGET_NAME)
  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  # FIXME: link path
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
    # Golang build source code
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${${TARGET_NAME}_LIB_PATH}"
    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
    # must run under GOPATH
@@ -299,11 +300,13 @@ function(go_binary TARGET_NAME)
  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  # FIXME: link path
  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+      COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
+      GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
-  WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
  # TODO: don't know what ${TARGET_NAME}_link does
  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
@@ -332,3 +335,12 @@ function(proto_library TARGET_NAME)
  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()
+function(py_proto_compile TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(py_srcs)
+  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
+endfunction()
\ No newline at end of file
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-go_binary(master SRC master.go)
+go_binary(master SRC master.go DEPS paddle_go_optimizer)
--- a/go/cmd/pserver/CMakeLists.txt
+++ b/go/cmd/pserver/CMakeLists.txt
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-go_binary(pserver SRCS pserver.go)
+go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
+target_link_libraries(paddle_go_optimizer stdc++ m)
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
-  add_subdirectory(test)
+  # FIXME: this test requires pserver which is not managed by the test
+  # we need some kind of e2e testing machanism.
+  # add_subdirectory(test)
 endif()
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
-cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
+cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
 add_style_check_target(test_cclient test_cclient.c)
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -2,7 +2,7 @@ package pserver
 // #cgo CFLAGS: -I ../../
 // //FIXME: ldflags contain "build" path
-// #cgo LDFLAGS: ../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
 // #include "paddle/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
@@ -56,8 +56,8 @@ func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
 func (o *optimizer) GetWeights() []byte {
 	var buffer unsafe.Pointer
-	buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer)
+	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
-	return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float)
+	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
 func (o *optimizer) UpdateParameter(g Gradient) error {

--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -10,8 +10,10 @@ import (
 type ElementType int
 const (
+	// AlreadyInitialized is true if pserver is initialized
 	AlreadyInitialized = "pserver already initialized"
-	Uninitialized      = "pserver not fully initialized"
+	// Uninitialized is true if pserver not fully initialized
+	Uninitialized = "pserver not fully initialized"
 )
 // Supported element types
@@ -55,7 +57,7 @@ func NewService(idx int) (*Service, error) {
 	s := &Service{
 		idx: idx,
 	}
-  s.optMap = make(map[string]*optimizer)
+	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
 	return s, nil
 }

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
    paddle_trainer_lib
    paddle_network
    paddle_parameter
+    paddle_optimizer
    paddle_math
    paddle_utils
    paddle_proto

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -9,6 +9,9 @@ cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+add_dependencies(framework_py_proto framework_py_proto_init)
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -25,6 +25,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the average pooling
+ *              operation is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences

--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
 }
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
 void CrossChannelNormLayer::forward(PassType passType) {
  Layer::forward(passType);
  MatrixPtr inV = getInputValue(0);
@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-  normBuffer_->zeroMem();
-  // add eps to avoid overflow
-  normBuffer_->addScalar(*normBuffer_, 1e-6);
  inV->square2(*dataBuffer_);
  for (size_t i = 0; i < batchSize; i++) {
    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {
    // compute norm.
    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // add eps to avoid overflow
+    spatialBuffer_->add(1e-6);
    spatialBuffer_->sqrt2(*spatialBuffer_);
    normTmp->copyFrom(*spatialBuffer_);
    outVTmp->copyFrom(*inVTmp);
@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
  size_t dataDim = inG->getWidth();
  size_t spatialDim = dataDim / channels_;
+  MatrixPtr inGBuffer;
+  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
  dataBuffer_->dotMul(*outG, *outV);
  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
    scaleDiff_->add(*channelBuffer_, 1.);
    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
    // scale the grad
-    inGTmp->copyFrom(*inVTmp);
+    inGBuffer->copyFrom(*inVTmp);
-    inGTmp->mulRowVector(*spatialBuffer_);
+    inGBuffer->mulRowVector(*spatialBuffer_);
    // divide by square of norm
    spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGTmp->divRowVector(*spatialBuffer_);
+    inGBuffer->divRowVector(*spatialBuffer_);
    // subtract
-    inGTmp->add(*outGTmp, -1, 1);
+    inGBuffer->add(*outGTmp, -1, 1);
    // divide by norm
-    inGTmp->divRowVector(*normTmp);
+    inGBuffer->divRowVector(*normTmp);
    // scale the diff
-    inGTmp->mulColVector(*scale_->getW());
+    inGBuffer->mulColVector(*scale_->getW());
+    inGTmp->add(*inGBuffer);
  }
  // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
  scale_->getParameterPtr()->incUpdate(callback);
 }

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -26,6 +26,10 @@ namespace paddle {
 * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the max pooling operation is
+ *              then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences

--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
  return true;
 }
-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
 }  // namespace paddle
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -26,10 +26,9 @@ namespace paddle {
 * If SequenceLevel = kNonseq:
 *   Output: a sequence containing only the last instance of the input sequence
 *   If stride_ > 0:
- *      Output: a shorten sequence. The operation of getting last instance of a
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              sequence is independently performed on every slice of the input
+ *              window upon the input sequence, and getting last instance
- *              sequence, which is obtained by sliding a window with the window
+ *              operation is then applied to each interval independently.
- *              size set to stride_.
 * If SequenceLevel = kSeq:
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
  SequencePoolLayer::forward(passType);
-  auto starts = (stride_ > 0) ? stridePositions_->getData()
+  auto starts = startPositions_->getData(false);
-                              : startPositions_->getData(false);
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -72,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
  if (stride_ > 0) {
    CHECK_EQ(input.hasSubseq(), 0UL)
        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
-        input, stride_, &stridePositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
-    newBatchSize_ = stridePositions_->getSize() - 1;
  }
  resetOutput(newBatchSize_, dim);

--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -28,8 +28,9 @@ namespace paddle {
 * sequence}{input[i]}
 *    If stride_ > 0:
 *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence, pooling is performed upon a small local
+ *        Output: a shorten sequence. Stride is the step size by which we slide
- *                area
+ *                a window upon the input sequence, and the pooling operation
+ *                is then applied to each interval independently.
 * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
@@ -47,8 +48,6 @@ protected:
  size_t newBatchSize_;
  ICpuGpuVectorPtr startPositions_;
  int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
  // Whether the input sequence is reversed or not.
  bool reversed_ = false;

--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
    std::vector<Argument> args;
    args.push_back(out);
-    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
+    ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
      start[seqId] += seqLens[seqId];
    }
@@ -465,7 +465,6 @@ void initTestLayer(TestConfig testConf,
                           ParameterConfig paraConfig) {
    paraConfig.set_name(paraName);
    paraConfig.set_size(paraSize);
-    paraConfig.set_initial_std(1);
    paraConfig.set_is_static(isStatic);
    auto para =
        std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
@@ -499,6 +498,9 @@ void initTestLayer(TestConfig testConf,
        paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
        paraConfig.add_dims(testConf.layerConfig.size());
      }
+      CHECK_GE(testConf.paramInitialStd, 0);
+      paraConfig.set_initial_mean(testConf.paramInitialMean);
+      paraConfig.set_initial_std(testConf.paramInitialStd);
      initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
    }
  }

--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -125,12 +125,16 @@ struct TestConfig {
  LayerConfig layerConfig;
  std::vector<InputDef> inputDefs;
  size_t biasSize;
+  real paramInitialMean;
+  real paramInitialStd;
  bool testAccumulate;
  bool testState;
  bool staticBias;
  bool testBatchState;
  TestConfig()
      : biasSize(0),
+        paramInitialMean(0.0),
+        paramInitialStd(1.0),
        testAccumulate(true),
        testState(false),
        staticBias(false),

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -845,8 +845,12 @@ void testDegradeLayer(bool hasSubseq,
 TEST(Layer, MaxLayer) {
  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
+  testDegradeLayer(false,
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shorten seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }
 TEST(Layer, SequenceLastInstanceLayer) {
@@ -868,6 +872,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
 TEST(Layer, AverageLayer) {
  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shorten seq, stride window = 5
  testDegradeLayer(
      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
@@ -1661,6 +1669,8 @@ TEST(Layer, PadLayer) {
 TEST(Layer, CrossChannelNormLayer) {
  TestConfig config;
+  config.paramInitialMean = 1.;
+  config.paramInitialStd = 0.;
  config.layerConfig.set_type("norm");
  config.layerConfig.set_size(100);
  LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1674,7 +1684,7 @@ TEST(Layer, CrossChannelNormLayer) {
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
  }
 }

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -561,7 +561,7 @@ void Argument::degradeSequence(const Argument& input) {
 void Argument::poolSequenceWithStride(const Argument& input,
                                      size_t stride,
-                                      IVectorPtr* stridePostions,
+                                      ICpuGpuVectorPtr* stridePostions,
                                      bool reversed) {
  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
  // then sequenceStartPositions = [0, 2, 3, 4, 7].
@@ -598,8 +598,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
  stridePos.emplace_back(starts[numSequences]);
  int size = stridePos.size();
  CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  IVector::resizeOrCreate(*stridePostions, size, false);
+  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->copyFrom(stridePos.data(), size);
+  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
 }
 void Argument::getValueString(

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -299,7 +299,7 @@ struct Argument {
   */
  void poolSequenceWithStride(const Argument& input,
                              size_t stride,
-                              IVectorPtr* stridePositions,
+                              ICpuGpuVectorPtr* stridePositions,
                              bool reversed = false);
  /**
   * @brief getValueString will return the argument's output in string. There

--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
  for (auto reversed : {false, true}) {
-    IVectorPtr stridePositions;
+    ICpuGpuVectorPtr stridePositions;
    output.poolSequenceWithStride(
        input, 5 /* stride */, &stridePositions, reversed);
@@ -45,7 +45,7 @@ TEST(Argument, poolSequenceWithStride) {
    CHECK_EQ(stridePositions->getSize(), 8UL);
    auto result = reversed ? strideResultReversed : strideResult;
    for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData()[i], result[i]);
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
    }
  }
 }

--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -142,7 +142,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
  }
  /// trigger to initialize RDMA lib
-  PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
+  CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
 }
 SocketServer::~SocketServer() {
@@ -168,7 +168,7 @@ void SocketServer::tcpServer() {
  /// First call to socket() function
  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(socket_ >= 0) << "ERROR opening socket";
+  CHECK(socket_ >= 0) << "ERROR opening socket";
  /// Initialize socket structure
  bzero((char *)&serv_addr, sizeof(serv_addr));
@@ -176,7 +176,7 @@ void SocketServer::tcpServer() {
  serv_addr.sin_port = htons(port_);
  if (!addr_.empty()) {
    server = gethostbyname(addr_.c_str());
-    PCHECK(server) << "ERROR, no such host: " << addr_;
+    CHECK(server) << "ERROR, no such host: " << addr_;
    bcopy((char *)server->h_addr,
          (char *)&serv_addr.sin_addr.s_addr,
          server->h_length);
@@ -187,7 +187,7 @@ void SocketServer::tcpServer() {
  setOption(socket_);
  /// Now bind the host address using bind() call.
-  PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
      << "ERROR on binding " << addr_;
  /// Now start listening for the clients, here process will
@@ -201,7 +201,7 @@ void SocketServer::tcpServer() {
    if (stopping_) {
      break;
    }
-    PCHECK(newsockfd >= 0) << "ERROR on accept";
+    CHECK(newsockfd >= 0) << "ERROR on accept";
    constexpr int kPeerNameLen = 128;
    char peerName[kPeerNameLen];
    CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
@@ -227,14 +227,14 @@ void SocketServer::rdmaServer() {
  /// First call to socket() function
  rdmaSocket_ = rdma::ssocket(rdmaCpu_);
-  PCHECK(rdmaSocket_) << "ERROR opening RDMA socket";
+  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
-  PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
+  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
      << "ERROR bind RDMA socket";
  /// Now start listening for the clients, here process will
  /// go in sleep mode and will wait for the incoming connection
-  PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
+  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
  while (true) {
    /// Accept actual connection from the client
@@ -242,7 +242,7 @@ void SocketServer::rdmaServer() {
    if (stopping_) {
      break;
    }
-    PCHECK(newsock) << "ERROR on accept";
+    CHECK(newsock) << "ERROR on accept";
    constexpr int kPeerNameLen = 128;
    char peerName[kPeerNameLen];
@@ -290,7 +290,7 @@ RdmaClientDaemons::RdmaClientDaemons() {
    onlineCpus_ = rdma::numCpus();
    for (auto i = 0; i < onlineCpus_; i++) {
      socket = rdma::csocket(i);
-      PCHECK(socket) << "ERROR open client socket daemon";
+      CHECK(socket) << "ERROR open client socket daemon";
      rdmaClientSocket_.push_back(socket);
    }
@@ -355,7 +355,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
  /// Create a socket point
  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(sockfd >= 0) << "ERROR opening socket";
+  CHECK(sockfd >= 0) << "ERROR opening socket";
 #if defined(__OSX__) || defined(__APPLE__)
  server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
@@ -396,8 +396,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
      }
      std::this_thread::sleep_for(std::chrono::seconds(1));
    } else {
-      PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
+      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
-                         << serverPort << "errorno: " << errno;
+                        << serverPort << "errorno: " << errno;
    }
  } while (errno == ECONNREFUSED);
@@ -426,7 +426,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
  /// connect to server with socket daemon
  sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
-  PCHECK(sock) << "ERROR connect to server" << rdmaUri;
+  CHECK(sock) << "ERROR connect to server" << rdmaUri;
  std::vector<std::string> seg;
  str::split(rdmaUri, '/', &seg);

--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) {
    else
      len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
-    PCHECK(len >= 0) << " peer=" << peerName_;
+    CHECK(len >= 0) << " peer=" << peerName_;
    if (len <= 0) {
      return total;
    }
@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) {
    else
      len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
-    PCHECK(len >= 0) << " peer=" << peerName_;
+    CHECK(len >= 0) << " peer=" << peerName_;
    if (len <= 0) {
      return total;
    }
@@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc,
  while (size < total) {
    ssize_t len =
        iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
-    PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
+    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
-                    << " iovCnt=" << iovcnt
+                   << " iovCnt=" << iovcnt
-                    << " iovs[curIov].base=" << iovs[curIov].iov_base
+                   << " iovs[curIov].base=" << iovs[curIov].iov_base
-                    << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
+                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
    size += len;
    /// restore iovs[curIov] to the original value
@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
    header.totalLength += iov.iov_len;
  }
-  PCHECK(writev(iovs) == (size_t)header.totalLength);
+  CHECK(writev(iovs) == (size_t)header.totalLength);
 }
 std::unique_ptr<MsgReader> SocketChannel::readMessage() {
@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
    return nullptr;
  }
-  PCHECK(len == sizeof(header));
+  CHECK(len == sizeof(header));
  std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
 MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
    : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
  size_t size = numBlocks * sizeof(blockLengths_[0]);
-  PCHECK(channel_->read(&blockLengths_[0], size) == size);
+  CHECK(channel_->read(&blockLengths_[0], size) == size);
 }
 void MsgReader::readBlocks(const std::vector<void*>& bufs) {
@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) {
    ++currentBlockIndex_;
  }
-  PCHECK(channel_->readv(&iovs) == totalLength);
+  CHECK(channel_->readv(&iovs) == totalLength);
 }
 void MsgReader::readNextBlock(void* buf) {
  CHECK_LT(currentBlockIndex_, blockLengths_.size());
-  PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
+  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
  ++currentBlockIndex_;
 }

--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -113,7 +113,7 @@ void SocketServer::run() {
  /* First call to socket() function */
  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(socket_ >= 0) << "ERROR opening socket";
+  CHECK(socket_ >= 0) << "ERROR opening socket";
  /* Initialize socket structure */
  bzero((char*)&serv_addr, sizeof(serv_addr));
@@ -122,7 +122,7 @@ void SocketServer::run() {
  serv_addr.sin_port = htons(port_);
  /* Now bind the host address using bind() call.*/
-  PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
      << "ERROR on binding";
  /* Now start listening for the clients, here process will
@@ -134,7 +134,7 @@ void SocketServer::run() {
  while (true) {
    /* Accept actual connection from the client */
    newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
-    PCHECK(newsockfd >= 0) << "ERROR on accept";
+    CHECK(newsockfd >= 0) << "ERROR on accept";
    SocketWorker* worker = new SocketWorker(newsockfd);
    worker->start();
@@ -146,17 +146,17 @@ void SocketWorker::run() {
  while (true) {
    int64_t n = channel_.readAll(&header, sizeof(header));
-    PCHECK(n == sizeof(header)) << "ERROR reading from socket";
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
    buffer_.resize(header.dataLength);
    n = channel_.readAll(&buffer_[0], header.dataLength);
-    PCHECK(n == header.dataLength) << "ERROR reading from socket";
+    CHECK(n == header.dataLength) << "ERROR reading from socket";
    /* Write a response to the client */
    n = channel_.writeAll(&header, sizeof(header));
-    PCHECK(n == sizeof(header)) << "ERROR reading from socket";
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
    n = channel_.writeAll(buffer_.data(), buffer_.size());
-    PCHECK(n == header.dataLength) << "ERROR writing to socket";
+    CHECK(n == header.dataLength) << "ERROR writing to socket";
  }
 }
@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
  /* Create a socket point */
  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(sockfd >= 0) << "ERROR opening socket";
+  CHECK(sockfd >= 0) << "ERROR opening socket";
  server = gethostbyname(serverAddr.c_str());
-  PCHECK(server) << "ERROR, no such host: " << serverAddr;
+  CHECK(server) << "ERROR, no such host: " << serverAddr;
  bzero((char*)&serv_addr, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
  serv_addr.sin_port = htons(serverPort);
  /* Now connect to the server */
-  PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
      << "ERROR connecting";
  channel_.reset(new SocketChannel(sockfd));
@@ -234,18 +234,18 @@ int main(int argc, char** argv) {
    cpuGrad.copyFrom(gpuGrad);
    header.dataLength = dataSize;
-    PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
+    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
        << "Client write header error";
-    PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
+    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
        << "Client write data error";
    /* Now read server response */
-    PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
+    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
        << "Client read header error";
    CHECK_EQ((uint64_t)header.dataLength, dataSize);
-    PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
+    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
        << "Client read data error";
    gpuParam.copyFrom(cpuParam);

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -78,7 +78,7 @@ paddle version
 # PaddlePaddle.  This awkwardness is due to
 # https://github.com/PaddlePaddle/Paddle/issues/1854.  It also
 # describes a solution.
-if [ ${WITH_DOC} == "ON" ]; then
+if [[ ${WITH_DOC} == "ON" ]]; then
    cat <<EOF
 ========================================
 Building documentation ...

--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -5,13 +5,14 @@ set -e
 mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build
-# Compile Documentation only.
+# Compile paddle binaries first
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
 mkdir output
 make -j `nproc`
 find .. -name '*whl' | xargs pip install  # install all wheels.
 rm -rf *
+# Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
 make -j `nproc` paddle_docs paddle_docs_cn
@@ -25,7 +26,7 @@ SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
 SHA=`git rev-parse --verify HEAD`
 # Documentation branch name
-# gh-pages branch is used for PaddlePaddle.org. The English version of 
+# gh-pages branch is used for PaddlePaddle.org. The English version of
 # documentation in `doc` directory, and the chinese version in `doc_cn`
 # directory.
 TARGET_BRANCH="gh-pages"
@@ -51,7 +52,7 @@ function deploy_docs() {
  # checkout github page branch
  git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
  mkdir -p ${DIR}
  # remove old docs. mv new docs.
  set +e
@@ -62,7 +63,7 @@ function deploy_docs() {
  git add .
 }
-deploy_docs "master" "." 
+deploy_docs "master" "."
 deploy_docs "develop" "./develop/"
 # Check is there anything changed.

--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
    }
    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
    FILE* fp = fopen(featFile.c_str(), "ab+");
-    PCHECK(!ferror(fp)) << "Fail to open " << featFile;
+    CHECK(!ferror(fp)) << "Fail to open " << featFile;
    size_t sampleNum = featMatrices[0]->getHeight();
    for (size_t i = 0; i < sampleNum; ++i) {

--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
  ThreadLocal() {
-    PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
  }
  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
@@ -65,7 +65,7 @@ public:
    if (!p && createLocal) {
      p = new T();
      int ret = pthread_setspecific(threadSpecificKey_, p);
-      PCHECK(ret == 0);
+      CHECK(ret == 0);
    }
    return p;
  }
@@ -79,7 +79,7 @@ public:
    if (T* q = get(false)) {
      dataDestructor(q);
    }
-    PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
  }
  /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
  ~ThreadLocalD() {
    pthread_key_delete(threadSpecificKey_);
    for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
    T* p = (T*)pthread_getspecific(threadSpecificKey_);
    if (!p) {
      p = new T();
-      PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
      updateMap(p);
    }
    return p;
@@ -141,7 +141,7 @@ public:
    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
      dataDestructor(q);
    }
-    PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
    updateMap(p);
  }

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 add_custom_target(paddle_python ALL DEPENDS
    ${OUTPUT_DIR}/.timestamp)
@@ -43,6 +43,7 @@ if (WITH_TESTING)
    add_subdirectory(paddle/v2/tests)
    add_subdirectory(paddle/v2/reader/tests)
    add_subdirectory(paddle/v2/plot/tests)
+    add_subdirectory(paddle/v2/framework/tests)
  endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 from __future__ import print_function
-import pdb
 '''
 The following functions are available in the config file:
@@ -762,8 +761,8 @@ class DotMulOperator(Operator):
    def check_dims(self):
        for i in range(2):
-            config_assert(self.operator_conf.input_sizes[
+            config_assert(self.operator_conf.input_sizes[i] ==
-                i] == self.operator_conf.output_size,
+                          self.operator_conf.output_size,
                          "DotMul input_size != output_size")
    def calc_output_size(self, input_sizes):
@@ -1194,7 +1193,8 @@ def parse_image(image, input_layer_name, image_conf):
 def parse_norm(norm, input_layer_name, norm_conf):
    norm_conf.norm_type = norm.norm_type
    config_assert(
-        norm.norm_type in ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        norm.norm_type in
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
        % norm.norm_type)
    norm_conf.channels = norm.channels
@@ -2475,10 +2475,14 @@ class MaxLayer(LayerBase):
                 trans_type='non-seq',
                 bias=False,
                 output_max_index=None,
+                 stride=-1,
                 **xargs):
        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
        self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
            self.set_layer_size(input_layer.size)
@@ -2740,11 +2744,15 @@ class AverageLayer(LayerBase):
                 average_strategy='average',
                 trans_type='non-seq',
                 bias=False,
+                 stride=-1,
                 **xargs):
        super(AverageLayer, self).__init__(
            name, 'average', 0, inputs=inputs, **xargs)
        self.config.average_strategy = average_strategy
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
        self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
        config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
@@ -3434,8 +3442,7 @@ DEFAULT_SETTING = dict(
 settings = copy.deepcopy(DEFAULT_SETTING)
-settings_deprecated = dict(
+settings_deprecated = dict(usage_ratio=1., )
-    usage_ratio=1., )
 trainer_settings = dict(
    save_dir="./output/model",

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1246,10 +1246,19 @@ def pooling_layer(input,
                  name=None,
                  bias_attr=None,
                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
+                  stride=-1,
                  layer_attr=None):
    """
    Pooling layer for sequence inputs, not used for Image.
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and return the pooling value of the window as the output. Thus, a long sequence
+    will be shorten. 
+    The parameter stride specifies the intervals at which to apply the pooling 
+    operation. Note that for sequence with sub-sequence, the default value
+    of stride is -1.
    The example usage is:
    .. code-block:: python
@@ -1268,6 +1277,8 @@ def pooling_layer(input,
    :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
                         SumPooling, SquareRootNPooling.
    :type pooling_type: BasePoolingType|None
+    :param stride: The step size between successive pooling regions.
+    :type stride: Int
    :param bias_attr: Bias parameter attribute. False if no bias.
    :type bias_attr: ParameterAttribute|None|False
    :param layer_attr: The Extra Attributes for layer, such as dropout.
@@ -1285,12 +1296,16 @@ def pooling_layer(input,
        extra_dict['output_max_index'] = pooling_type.output_max_index
    extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
+    if agg_level == AggregateLevel.TO_SEQUENCE:
+        assert stride == -1
    Layer(
        name=name,
        type=pooling_type.name,
        inputs=[Input(input.name)],
        bias=ParamAttr.to_bias(bias_attr),
        trans_type=agg_level,
+        stride=stride,
        **extra_dict)
    return LayerOutput(
@@ -1552,7 +1567,7 @@ def last_seq(input,
    :type name: basestring
    :param input: Input layer name.
    :type input: LayerOutput
-    :param stride: window size.
+    :param stride: The step size between successive pooling regions.
    :type stride: Int
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.
@@ -1608,7 +1623,7 @@ def first_seq(input,
    :type name: basestring
    :param input: Input layer name.
    :type input: LayerOutput
-    :param stride: window size.
+    :param stride: The step size between successive pooling regions.
    :type stride: Int
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
@@ -14,6 +14,7 @@ layers {
    input_layer_name: "dat_in"
  }
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_1__"
@@ -24,6 +25,7 @@ layers {
    input_layer_name: "dat_in"
  }
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_2__"
@@ -35,6 +37,7 @@ layers {
  }
  average_strategy: "average"
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_3__"
@@ -46,6 +49,7 @@ layers {
  }
  average_strategy: "average"
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_4__"
@@ -57,6 +61,7 @@ layers {
  }
  average_strategy: "sum"
  trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_5__"
@@ -68,6 +73,7 @@ layers {
  }
  average_strategy: "sum"
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
  name: "__seq_pooling_6__"
@@ -77,8 +83,44 @@ layers {
  inputs {
    input_layer_name: "dat_in"
  }
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_7__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "average"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_8__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "sum"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_9__"
+  type: "max"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
  output_max_index: true
  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 input_layer_names: "dat_in"
 output_layer_names: "__seq_pooling_0__"
@@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__"
 output_layer_names: "__seq_pooling_4__"
 output_layer_names: "__seq_pooling_5__"
 output_layer_names: "__seq_pooling_6__"
+output_layer_names: "__seq_pooling_7__"
+output_layer_names: "__seq_pooling_8__"
+output_layer_names: "__seq_pooling_9__"
 sub_models {
  name: "root"
  layer_names: "dat_in"
@@ -98,6 +143,9 @@ sub_models {
  layer_names: "__seq_pooling_4__"
  layer_names: "__seq_pooling_5__"
  layer_names: "__seq_pooling_6__"
+  layer_names: "__seq_pooling_7__"
+  layer_names: "__seq_pooling_8__"
+  layer_names: "__seq_pooling_9__"
  input_layer_names: "dat_in"
  output_layer_names: "__seq_pooling_0__"
  output_layer_names: "__seq_pooling_1__"
@@ -106,6 +154,9 @@ sub_models {
  output_layer_names: "__seq_pooling_4__"
  output_layer_names: "__seq_pooling_5__"
  output_layer_names: "__seq_pooling_6__"
+  output_layer_names: "__seq_pooling_7__"
+  output_layer_names: "__seq_pooling_8__"
+  output_layer_names: "__seq_pooling_9__"
  is_recurrent_layer_group: false
 }
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
@@ -14,6 +14,14 @@ for pt in POOL_TYPE:
    for al in AGG_LEVEL:
        opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
+for pt in POOL_TYPE:
+    opts.append(
+        pooling_layer(
+            input=din,
+            agg_level=AggregateLevel.TO_NO_SEQUENCE,
+            pooling_type=pt(),
+            stride=5))
 opts.append(
    pooling_layer(
        input=din, pooling_type=MaxPooling(output_max_index=True)))

--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
+__all__ = ['proto']
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
+add_python_test(test_framework test_protobuf.py)
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
+import paddle.v2.framework.proto.op_proto_pb2
+import paddle.v2.framework.proto.attr_type_pb2
+import unittest
+class TestFrameworkProto(unittest.TestCase):
+    def test_all(self):
+        op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
+        attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
+        op_proto = op_proto_lib.OpProto()
+        ipt0 = op_proto.inputs.add()
+        ipt0.name = "a"
+        ipt0.comment = "the input of cosine op"
+        ipt1 = op_proto.inputs.add()
+        ipt1.name = "b"
+        ipt1.comment = "the other input of cosine op"
+        opt = op_proto.outputs.add()
+        opt.name = "output"
+        opt.comment = "the output of cosine op"
+        op_proto.comment = "cosine op, output = scale*cos(a, b)"
+        attr = op_proto.attrs.add()
+        attr.name = "scale"
+        attr.comment = "scale of cosine op"
+        attr.type = attr_type_lib.FLOAT
+        op_proto.type = "cos"
+        self.assertTrue(op_proto.IsInitialized())
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -9,7 +9,9 @@ packages=['paddle',
          'paddle.v2.dataset',
          'paddle.v2.reader',
          'paddle.v2.master',
-          'paddle.v2.plot']
+          'paddle.v2.plot',
+          'paddle.v2.framework',
+          'paddle.v2.framework.proto']
 setup_requires=["requests",
                "numpy",
@@ -29,6 +31,9 @@ setup(name='paddle',
      packages=packages,
      package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], },
      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}'
+          '': '${CMAKE_CURRENT_SOURCE_DIR}',
+          # The paddle.v2.framework.proto will be generated while compiling.
+          # So that package points to other directory.
+          'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework'
      },
 )