diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d950e144ffdcc2f7ff619399f1f614d18f91146..007f1f18bb655e10c45bc7e21a299861eb204dd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b)
+set(PADDLE_PATCH_VERSION 0b0)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
diff --git a/doc/build/docker_install.md b/doc/build/docker_install.md
index 997a755956bf2562bc1d2b73a0899cc7f6156727..3cd9d1730a22b97c208233ad4c6f9bad8038c1bf 100644
--- a/doc/build/docker_install.md
+++ b/doc/build/docker_install.md
@@ -8,12 +8,12 @@ Docker is a tool designed to make it easier to create, deploy, and run applicati
 ### PaddlePaddle Docker images
 There are six Docker images:
 
-- paddledev/paddle:latest-cpu: PaddlePaddle CPU binary image.
-- paddledev/paddle:latest-gpu: PaddlePaddle GPU binary image.
-- paddledev/paddle:latest-cpu-devel: PaddlePaddle CPU binary image plus source code.
-- paddledev/paddle:latest-gpu-devel: PaddlePaddle GPU binary image plus source code.
-- paddledev/paddle:latest-cpu-demo: PaddlePaddle CPU binary image plus source code and demo
-- paddledev/paddle:latest-gpu-demo: PaddlePaddle GPU binary image plus source code and demo
+- paddledev/paddle:cpu-latest: PaddlePaddle CPU binary image.
+- paddledev/paddle:gpu-latest: PaddlePaddle GPU binary image.
+- paddledev/paddle:cpu-devel-latest: PaddlePaddle CPU binary image plus source code.
+- paddledev/paddle:gpu-devel-latest: PaddlePaddle GPU binary image plus source code.
+- paddledev/paddle:cpu-demo-latest: PaddlePaddle CPU binary image plus source code and demo
+- paddledev/paddle:gpu-demo-latest: PaddlePaddle GPU binary image plus source code and demo
 
 Tags with latest will be replaced by a released version.
 
@@ -23,7 +23,7 @@ You have to install Docker in your machine which has linux kernel version 3.10+
 You can use ```docker pull ```to download images first, or just launch a container with ```docker run```:
 
 ```bash
-docker run -it paddledev/paddle:lastest-cpu
+docker run -it paddledev/paddle:cpu-latest
 ```
 
 If you want to launch container with GPU support, you need to set some environment variables at the same time:
@@ -31,7 +31,7 @@ If you want to launch container with GPU support, you need to set some environme
 ```bash
 export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
 export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-docker run -it paddledev/paddle:latest-gpu
+docker run -it paddledev/paddle:gpu-latest
 ```
 
 ### Notice
diff --git a/doc/demo/imagenet_model/resnet_model.md b/doc/demo/imagenet_model/resnet_model.md
index 2e5c7f324434d896aaf0e3232f976e5ae6ed7f03..5403ab9f17d2399fee878d0f3c512cb166aba06f 100644
--- a/doc/demo/imagenet_model/resnet_model.md
+++ b/doc/demo/imagenet_model/resnet_model.md
@@ -165,7 +165,7 @@ We provide both C++ and Python interfaces to extract features. The following exa
 
 ### C++ Interface
 
-First, specify image data list in `define_py_data_sources` in the config, see example `demo/model_zoo/resnet/resnet.py`.
+First, specify image data list in `define_py_data_sources2` in the config, see example `demo/model_zoo/resnet/resnet.py`.
 
 ```
 train_list = 'train.list' if not is_test else None
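For context, the renamed `define_py_data_sources2` is called with the same arguments as the old helper in the configs touched by this patch. A minimal sketch of such a call — the list file, provider module, and args are placeholders, mirroring the `sequence_layer_group.conf` change later in this patch:

```python
# Hypothetical config sketch; 'my_provider' and its 'process' generator
# are placeholders, not part of this patch.
from paddle.trainer_config_helpers import *

define_py_data_sources2(train_list='train.list',  # file listing the training data files
                        test_list=None,           # no test data in this sketch
                        module='my_provider',     # module defining the @provider function
                        obj='process',            # name of the decorated generator
                        args={'dict_file': {}})   # kwargs forwarded to the init_hook
```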
diff --git a/doc/demo/rec/ml_regression.rst b/doc/demo/rec/ml_regression.rst
index 4917f873a934dc93e8618627473dbe99644b10b5..0c14e4f5bb7f815a06c0c756b1a6e6ef9099fd66 100644
--- a/doc/demo/rec/ml_regression.rst
+++ b/doc/demo/rec/ml_regression.rst
@@ -257,7 +257,7 @@ In these network, we use several api in `trainer_config_helpers
 * Text Convolution Pooling Layer, `text_conv_pool
   <../../ui/api/trainer_config_helpers/networks.html
   #trainer_config_helpers.networks.text_conv_pool>`_
-* Declare Python Data Sources, `define_py_data_sources
+* Declare Python Data Sources, `define_py_data_sources2
   <../../ui/api/trainer_config_helpers/data_sources.html>`_
 
 Data Provider
diff --git a/doc/ui/predict/predict_sample.py b/doc/ui/predict/predict_sample.py
index ac16b2b48b9f77af5c281e475923310cbc2abe5d..d55d2c730dece07f068b728d0a75f34c70b817bd 100644
--- a/doc/ui/predict/predict_sample.py
+++ b/doc/ui/predict/predict_sample.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from py_paddle import swig_paddle, DataProviderWrapperConverter
-from paddle.trainer.PyDataProviderWrapper import DenseSlot
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import dense_vector
 from paddle.trainer.config_parser import parse_config
 
 TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -89,12 +89,12 @@ TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 
 def main():
-    conf = parse_config("./mnist_model/trainer_config.conf.norm", "")
+    conf = parse_config("./mnist_model/trainer_config.py", "")
     print conf.data_config.load_data_args
     network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
     assert isinstance(network, swig_paddle.GradientMachine)  # For code hint.
     network.loadParameters("./mnist_model/")
-    converter = DataProviderWrapperConverter(False, [DenseSlot(784)])
+    converter = DataProviderConverter([dense_vector(784)])
     inArg = converter(TEST_DATA)
     print network.forwardTest(inArg)
diff --git a/doc/ui/predict/swig_py_paddle_en.rst b/doc/ui/predict/swig_py_paddle_en.rst
index 9841f124e25a43bbf74e6d3828f99520287307fe..b743fc456914664168e1be6c7f18a419c38afa62 100644
--- a/doc/ui/predict/swig_py_paddle_en.rst
+++ b/doc/ui/predict/swig_py_paddle_en.rst
@@ -10,27 +10,35 @@ SWIG. The main steps of predict values in python are:
 * Predict
 
 Here is a sample python script that shows the typical prediction process for the
-MNIST classification problem.
+MNIST classification problem. The complete sample code can be found at
+:code:`src_root/doc/ui/predict/predict_sample.py`.
 
 .. literalinclude:: ./predict_sample.py
    :language: python
-   :linenos:
+   :lines: 15-18,90-100,101-104
 
 The module that does the most of the job is py_paddle.swig_paddle, it's
 generated by SWIG and has complete documents, for more details you can use
 python's :code:`help()` function. Let's walk through the above python script:
 
-* At the beginning, initialize PaddlePaddle with command line arguments(line 90).
-* Parse the configuration file that is used in training(line 93).
-* Create a neural network at line 95 according the parsed configuration, then
-  load the trained parameters from model at line 97.
-* A utility class for data transformation is created at line 98.
+* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
+  PaddlePaddle with command line arguments; for more about command line
+  arguments, see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_.
+* Parse the configuration file that is used in training with :code:`parse_config()`.
+  Because the data to predict usually has no label, and the prediction output is
+  normally taken from the output layer rather than the cost layer, you should
+  modify the configuration file accordingly before using it for prediction.
+* Create a neural network with
+  :code:`swig_paddle.GradientMachine.createFromConfigProto()`, which takes the
+  parsed configuration :code:`conf.model_config` as its argument. Then load the
+  trained parameters from the model with :code:`network.loadParameters()`.
+* Create a data converter object of utility class :code:`DataProviderConverter`.
   - Note: As swig_paddle can only accept C++ matrices, we offer a utility
-    class DataProviderWraaperConverter that can accept the same input data with
-    PyDataProviderWrapper, for more information please refer to document
+    class DataProviderConverter that can accept the same input data with
+    PyDataProvider2, for more information please refer to the document
     of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_.
-* Do the prediction and output the result at line 100, forwardTest is another
-  utility class that directly takes the activations of the output layer.
+* Do the prediction with :code:`forwardTest()`, which takes the converted
+  input data and outputs the activations of the output layer.
 
 Here is a typical output:
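Putting those steps together, a condensed sketch of the prediction flow, adapted from `predict_sample.py` in this patch (the model directory and `TEST_DATA` are as in that sample; the `--use_gpu=0` flag is only an illustrative command line argument):

```python
# Condensed from doc/ui/predict/predict_sample.py; TEST_DATA is the dense
# 784-dimensional MNIST input defined in that sample.
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config

swig_paddle.initPaddle("--use_gpu=0")  # initialize with command line arguments
conf = parse_config("./mnist_model/trainer_config.py", "")
network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
network.loadParameters("./mnist_model/")  # load the trained parameters
converter = DataProviderConverter([dense_vector(784)])
output = network.forwardTest(converter(TEST_DATA))  # output layer activations
```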
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c6fa7dc2b16e1c215ef5d16b13307983869641c4..cae0f64400a7e618bffb4f7fc6a044011baf04d4 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -7,6 +7,9 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
+    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
+
 if(WITH_PREDICT_SDK)
   add_subdirectory(predict)
 endif()
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 15799919fa137ca0e9021978a100c5fd25d23b75..fc003b7d6377d199c3859aa8b257de07478992af 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -266,25 +266,21 @@ template<int blockSize>
 __global__ void KeMatrixClassificationError(real* in_A,
                                             int* in_B,
                                             real* out_C,
-                                            int dimM,
                                             int dimN) {
   __shared__ real max_s[blockSize];
   __shared__ int max_l[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int tid = threadIdx.x;
-  int lmt = tid;
-  int index = 0;
-  real t;
+  const int tid = threadIdx.x;
+  const int rowId = blockIdx.x;
 
   max_s[tid] = -1e30f;
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    index = blockIdx.y*dimN + lmt;
-    t = in_A[index];
-    if (max_s[tid] < t) {
-      max_s[tid] = t;
-      max_l[tid] = lmt;
+  in_A += rowId * dimN;
+  real tmp;
+  for (int colId = tid; colId < dimN; colId += blockSize) {
+    tmp = in_A[colId];
+    if (max_s[tid] < tmp) {
+      max_s[tid] = tmp;
+      max_l[tid] = colId;
     }
-    lmt += blockSize;
   }
   __syncthreads();
 
@@ -300,7 +296,7 @@ __global__ void KeMatrixClassificationError(real* in_A,
   __syncthreads();
 
   if (tid == 0) {
-    out_C[blockIdx.y] = (max_l[0] == in_B[blockIdx.y] ? 0 : 1.0f);
+    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
   }
 }
 
@@ -313,12 +309,9 @@ void hl_matrix_classification_error(real* A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
 
-  int blocksX = 1;
-  int blocksY = dimM;
-  dim3 threads(1024, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixClassificationError<1024><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (A_d, B_d, C_d, dimM, dimN);
+  // each sample is calculated by one block
+  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
+    (A_d, B_d, C_d, dimN);
 
   CHECK_SYNC("hl_matrix_classification_error");
 }
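Functionally, the rewritten kernel still computes a per-sample 0/1 error: one block now scans one row, threads stride over the columns to find the argmax, and a block reduction compares it with the label. A NumPy sketch of the intended semantics (a reference model, not the CUDA code itself):

```python
import numpy as np

def classification_error(output, label):
    # output: (dimM, dimN) scores; label: (dimM,) integer class ids.
    # out[i] = 0.0 if argmax(output[i]) == label[i] else 1.0.
    # Note: ties between equal scores may resolve differently than in the
    # strided CUDA reduction above.
    return (output.argmax(axis=1) != label).astype(np.float32)
```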
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index df8a2b03142b850f0272a37175190c2af73f4d83..fb0a0ddb3d45bac5339b6eb4a11ba3c01d0bd97f 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -47,81 +47,40 @@ bool CRFLayer::init(const LayerMap& layerMap,
   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
   setNeedSequenceInfo(false);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
+
   return true;
 }
 
 void CRFLayer::forward(PassType passType) {
   Layer::forward(passType);
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(getInput(i), false, HPPL_STREAM_1);
-    }
-    VectorPtr cpuParameterValue;
-    VectorPtr cpuParameterGradient;
-    cpuParameterValue =
-        Vector::create(parameter_->getBuf(PARAMETER_VALUE)->getSize(), false);
-    cpuParameterValue->
-        copyFrom(*parameter_->getBuf(PARAMETER_VALUE), HPPL_STREAM_1);
-    if (parameter_->getBuf(PARAMETER_GRADIENT)) {
-      cpuParameterGradient =
-          Vector::create(parameter_->getBuf(PARAMETER_GRADIENT)->getSize(),
-                         false);
-      cpuParameterGradient->
-          copyFrom(*parameter_->getBuf(PARAMETER_GRADIENT), HPPL_STREAM_1);
-    } else {
-      cpuParameterGradient = nullptr;
-    }
-    forwardImp(tmpCpuInput_[0], tmpCpuInput_[1], cpuParameterValue,
-               cpuParameterGradient);
-    parameter_->getBuf(PARAMETER_VALUE)->copyFrom(*cpuParameterValue,
-                                                  HPPL_STREAM_1);
-    if (parameter_->getBuf(PARAMETER_GRADIENT)) {
-      parameter_->getBuf(PARAMETER_GRADIENT)->copyFrom(*cpuParameterGradient,
-                                                       HPPL_STREAM_1);
-    }
-  } else {
-    forwardImp(getInput(0), getInput(1), parameter_->getBuf(PARAMETER_VALUE),
-               parameter_->getBuf(PARAMETER_GRADIENT));
-  }
-}
 
-void CRFLayer::forwardImp(const Argument&output,
-                          const Argument& label,
-                          VectorPtr parameterValue,
-                          VectorPtr parameterGradient) {
+  CHECK(!useGpu_) << "GPU is not supported";
+
+  const Argument& output = getInput(0);
+  const Argument& label = getInput(1);
   CHECK(label.sequenceStartPositions);
   CHECK(label.ids);
 
   int batchSize = output.getBatchSize();
   size_t numSequences = label.sequenceStartPositions->getSize() - 1;
   resizeOutput(numSequences, 1);
-  std::vector<real> out(numSequences);
 
   const int* starts = label.sequenceStartPositions->getData(false);
   CHECK_EQ(starts[numSequences], batchSize);
-  VectorPtr cpuParameterValue;
-  VectorPtr cpuParameterGradient;
 
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
       crfs_.emplace_back(numClasses_,
-                         parameterValue->getData(),
-                         parameterGradient
-                             ? parameterGradient->getData()
+                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
+                         parameter_->getBuf(PARAMETER_GRADIENT)
+                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
                              : nullptr);
     }
-    out[i] = crfs_[i].forward(
+    output_.value->getData()[i] = crfs_[i].forward(
         output.value->getData() + numClasses_ * starts[i],
         label.ids->getData() + starts[i],
         starts[i + 1] - starts[i]);
   }
-  output_.value->copyFrom(out.data(), numSequences);
+
   if (weightLayer_) {
     const MatrixPtr& weight = getInputValue(*weightLayer_);
     getOutputValue()->dotMul(*getOutputValue(), *weight);
@@ -129,22 +88,8 @@ void CRFLayer::forwardImp(const Argument&output,
 }
 
 void CRFLayer::backward(const UpdateCallback &callback) {
-  (void)callback;
-  if (useGpu_) {
-    backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]);
-    const_cast<Argument&>(getInput(0)).
-        resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_1);
-    const_cast<Argument&>(getInput(1)).
-        resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_1);
-
-  } else {
-    backwardImp(callback, getInput(0), getInput(1));
-  }
-}
-
-void CRFLayer::backwardImp(const UpdateCallback& callback,
-                           const Argument&output,
-                           const Argument& label) {
+  const Argument& output = getInput(0);
+  const Argument& label = getInput(1);
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;
 
@@ -159,9 +104,11 @@ void CRFLayer::backwardImp(const UpdateCallback& callback,
       grad->mulScalar(weight);
     }
   }
+
   if (coeff_ != real(1.0f)) {
     output.grad->mulScalar(coeff_);
   }
+
   parameter_->incUpdate(callback);
 }
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index 5facb9b54818c261c139cc55ae88834cb639bf8d..c6ba8e7c965a3957b022319bdd7f6c4c012153c7 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -32,11 +32,7 @@ public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
   virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
   virtual void forward(PassType passType);
-  void forwardImp(const Argument&output, const Argument& label,
-                  VectorPtr parameterValue, VectorPtr parameterGradient);
   virtual void backward(const UpdateCallback& callback);
-  void backwardImp(const UpdateCallback& callback, const Argument&output,
-                   const Argument& label);
 
 protected:
   size_t numClasses_;
@@ -44,7 +40,6 @@ protected:
   std::vector<LinearChainCRF> crfs_;
   LayerPtr weightLayer_;  // weight for each sequence
   real coeff_;  // weight for the layer
-  std::vector<Argument> tmpCpuInput_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index dd2b90dd4986cf049ae4514d308fbbf549371f90..e4727e472d446b48e6001968841bfc178e34ec0c 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -18,27 +18,33 @@
 import os
 import sys
 
-from paddle.trainer.PyDataProviderWrapper import *
+from paddle.trainer.PyDataProvider2 import *
 
-@init_hook_wrapper
-def hook(obj, dict_file, **kwargs):
-    obj.word_dict = dict_file
-    obj.slots = [IndexSlot(len(obj.word_dict)), IndexSlot(3)]
-    obj.logger.info('dict len : %d' % (len(obj.word_dict)))
+def hook(settings, dict_file, **kwargs):
+    settings.word_dict = dict_file
+    settings.input_types = [integer_value_sequence(len(settings.word_dict)),
+                            integer_value_sequence(3)]
+    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
 
-@provider(use_seq=True, init_hook=hook)
-def process(obj, file_name):
+@provider(init_hook=hook)
+def process(settings, file_name):
     with open(file_name, 'r') as fdata:
         for line in fdata:
             label, comment = line.strip().split('\t')
             label = int(''.join(label.split()))
             words = comment.split()
-            word_slot = [obj.word_dict[w] for w in words if w in obj.word_dict]
+            word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
             yield word_slot, [label]
 
 ## for hierarchical sequence network
-@provider(use_seq=True, init_hook=hook)
-def process2(obj, file_name):
+def hook2(settings, dict_file, **kwargs):
+    settings.word_dict = dict_file
+    settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
+                            integer_value_sub_sequence(3)]
+    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
+
+@provider(init_hook=hook2)
+def process2(settings, file_name):
     with open(file_name) as fdata:
         label_list = []
         word_slot_list = []
@@ -47,7 +53,7 @@
             label,comment = line.strip().split('\t')
             label = int(''.join(label.split()))
             words = comment.split()
-            word_slot = [obj.word_dict[w] for w in words if w in obj.word_dict]
+            word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
             label_list.append([label])
             word_slot_list.append(word_slot)
         else:
diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf
index 9ad2b3762845faf9e5deac2362b971e0abcbd9a7..ac031b31280df297246c1ea2e279fc2c595bd8b7 100644
--- a/paddle/gserver/tests/sequence_layer_group.conf
+++ b/paddle/gserver/tests/sequence_layer_group.conf
@@ -21,11 +21,11 @@ dict_file = dict()
 for line_count, line in enumerate(open(dict_path, "r")):
     dict_file[line.strip()] = line_count
-define_py_data_sources(train_list='gserver/tests/Sequence/train.list',
-                       test_list=None,
-                       module='sequenceGen',
-                       obj='process',
-                       args={"dict_file":dict_file})
+define_py_data_sources2(train_list='gserver/tests/Sequence/train.list',
+                        test_list=None,
+                        module='sequenceGen',
+                        obj='process',
+                        args={"dict_file":dict_file})
 
 settings(batch_size=5)
 
 ######################## network configure ################################
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
index 8c3a08f16cd1cc593ffe48839bddd12852a41931..38c60b657b969f9fbcf46a00c542fa100da5a877 100644
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ b/paddle/gserver/tests/sequence_nest_layer_group.conf
@@ -21,11 +21,11 @@ dict_file = dict()
 for line_count, line in enumerate(open(dict_path, "r")):
     dict_file[line.strip()] = line_count
-define_py_data_sources(train_list='gserver/tests/Sequence/train.list.nest',
-                       test_list=None,
-                       module='sequenceGen',
-                       obj='process2',
-                       args={"dict_file":dict_file})
+define_py_data_sources2(train_list='gserver/tests/Sequence/train.list.nest',
+                        test_list=None,
+                        module='sequenceGen',
+                        obj='process2',
+                        args={"dict_file":dict_file})
 
 settings(batch_size=2)
 
 ######################## network configure ################################
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 7bb79ff5b702a4e3f0c2af44eb81fe4416cdcb43..5c80eb546cfafb223454487ebf198394cc3a96c5 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -179,10 +179,9 @@ TEST(Layer, CRFLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "crf", 100, /* trans */ false, /* useGpu */ useGpu,
-                  false /*useWeight*/, 0.03 /*epsilon*/);
-  }
+  // GPU is not supported yet
+  testLayerGrad(config, "crf", 100, /* trans */ false, /* useGpu */ false,
+                false /*useWeight*/, 0.03 /*epsilon*/);
 }
 
 TEST(Layer, CTCLayer) {
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7caade444b827f132daa5606c4829c79e3da98bc..fe8eacc2efbc5446e63ed40f914b62c70189010e 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1697,7 +1697,6 @@ TEST(Matrix, cosSimDerivate) {
   }
 }
 
-
 void testParamReluForward(int height, int width,
                           int w_height, int w_width) {
   MatrixPtr output = CpuMatrix::create(height, width, false, false);
@@ -1736,7 +1735,6 @@ TEST(Matrix, paramReluForward) {
   }
 }
 
-
 void testParamReluBackwardW(int height, int width,
                             int w_height, int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1775,7 +1773,6 @@ TEST(Matrix, paramReluBackwardW) {
   }
 }
 
-
 void testParamReluBackwardDiff(int height, int width,
                                int w_height, int w_width) {
   MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
@@ -1819,6 +1816,36 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }
 
+void testClassificationError(int numSamples, int dim) {
+  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
+  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
+  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
+  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
+
+  cpuOutput->randomizeUniform();
+  cpuLabel->rand(dim);
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuLabel->copyFrom(*cpuLabel);
+
+  cpuError->classificationError(cpuOutput, cpuLabel);
+  gpuError->classificationError(gpuOutput, gpuLabel);
+
+  MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, 1);
+  check->copyFrom(*gpuError);
+  MatrixCheckEqual(*cpuError, *check);
+}
+
+TEST(Matrix, classificationError) {
+  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
+    for (auto dim : {1, 10, 100, 1000}) {
+      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
+      testClassificationError(numSamples, dim);
+    }
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 93f86ceccffc538922230f7312de11d67dd53c44..8610a66452358e1b2e2a846ddfcf62a0ce99e22e 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -269,6 +269,9 @@ void Argument::concat(const std::vector<Argument>& args,
                       const std::vector<int>& selectRows,
                       const std::vector<int>& seqStartPos, bool useGpu,
                       hl_stream_t stream, PassType passType) {
+  CHECK(!subSequenceStartPositions)
+      << "undefined behavior for subsequence positions";
+
   size_t batchSize = selectRows.size();
   auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src,
                                      int startRow, int pos, int size,
@@ -347,9 +350,11 @@ void Argument::concat(const std::vector<Argument>& args, bool useGpu,
                       hl_stream_t stream, PassType passType) {
   int32_t batchSize = 0;
   int64_t numSequences = 0;
+  int64_t numSubSequences = 0;
   for (auto& arg : args) {
     batchSize += arg.getBatchSize();
     numSequences += arg.getNumSequences();
+    numSubSequences += arg.getNumSubSequences();
   }
 
   auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src,
@@ -393,8 +398,26 @@ void Argument::concat(const std::vector<Argument>& args, bool useGpu,
     std::copy(src->begin(), src->end(), dst->begin() + startRow);
   };
 
+  auto copySequencePos = []
+      (ICpuGpuVectorPtr& dstSeq, const ICpuGpuVectorPtr& srcSeq,
+       int dstNumSequences, int srcNumSequences,
+       int& startSequences, int startRow) {
+    if (srcSeq) {
+      ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false);
+      const int* src = srcSeq->getData(false);
+      int* dest = dstSeq->getMutableData(false);
+      for (int i = 0; i < srcNumSequences + 1; ++i) {
+        dest[i + startSequences] = src[i] + startRow;
+      }
+      startSequences += srcNumSequences;
+    } else {
+      dstSeq.reset();
+    }
+  };
+
   int startRow = 0;
   int startSequences = 0;
+  int startSubSequences = 0;
   dataId = args[0].dataId;
   for (auto& arg : args) {
     CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
@@ -403,17 +426,18 @@
     copyArg(value, arg.value, startRow, useGpu);
     if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu);
     copyIds(ids, arg.ids, startRow, useGpu);
-    if (arg.sequenceStartPositions) {
-      ICpuGpuVector::resizeOrCreate(sequenceStartPositions,
-                                    numSequences + 1,
-                                    false);
-      const int* src = arg.sequenceStartPositions->getData(false);
-      int* dest = sequenceStartPositions->getMutableData(false);
-      for (int i = 0; i < arg.getNumSequences() + 1; ++i) {
-        dest[i + startSequences] = src[i] + startRow;
-      }
-      startSequences += arg.getNumSequences();
-    }
+    copySequencePos(sequenceStartPositions,
+                    arg.sequenceStartPositions,
+                    numSequences,
+                    arg.getNumSequences(),
+                    startSequences,
+                    startRow);
+    copySequencePos(subSequenceStartPositions,
+                    arg.subSequenceStartPositions,
+                    numSubSequences,
+                    arg.getNumSubSequences(),
+                    startSubSequences,
+                    startRow);
     copyStrs(strs, arg.strs, startRow, useGpu);
     startRow += arg.getBatchSize();
   }
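The offset logic that `copySequencePos` factors out is easy to state in isolation: each argument's start-position array `[0, ..., batchSize]` is shifted by the number of rows already copied, and the shared boundary between consecutive arguments collapses into one entry. A toy Python sketch of that invariant (not the C++ API):

```python
def concat_seq_pos(seq_pos_list, batch_sizes):
    # Each seq_pos is [0, ..., batch_size]; shift by the rows copied so far.
    result, start_row = [0], 0
    for seq_pos, batch_size in zip(seq_pos_list, batch_sizes):
        result.extend(p + start_row for p in seq_pos[1:])
        start_row += batch_size
    return result

# Two arguments with batch sizes 5 and 6:
assert concat_seq_pos([[0, 3, 5], [0, 2, 4, 6]], [5, 6]) == [0, 3, 5, 7, 9, 11]
```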
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index 07961cbdcc20ccfbe31ff6ff3cbdd532f3d62a8e..d0e5352c828d197d6854ef19e6310dc63913846d 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -278,7 +278,7 @@ void ParameterClient2::prepareSendData(
 
       if (sendingPara) {
         sendJob->parallelInputIovs[serverId].push_back(
-            {sendMat->getLocalRow(row), sizeof(real) * blockSize});
+            {sendMat->getLocalRow(row), sizeof(real) * (size_t) blockSize});
         /// detect sparse parameter distribution
         sparseDistribution_->probeDistribution(serverId,
                                                sizeof(real) * blockSize);
@@ -302,8 +302,8 @@ void ParameterClient2::prepareSendData(
       block->set_begin_pos(beginDim);
       block->set_block_size(endDim - beginDim);
       if (buf) {
-        sendJob->parallelInputIovs[serverId].push_back(
-            {buf + beginDim, sizeof(real) * (endDim - beginDim)});
+        sendJob->parallelInputIovs[serverId].push_back({buf + beginDim,
+            sizeof(real) * ((size_t) (endDim - beginDim))});
       }
     }
   }
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index bb3caeb728d1c44a10423946cecaa5a377b2be9b..8f72c1988d1676503f8ab1174d34a8ee6fe78516 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -724,7 +724,7 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block,
                 << " id=" << block.para_id() << " block id=" << block.block_id();
 
   real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  outputBuffers->push_back({valueBuffer, block.block_size()});
+  outputBuffers->push_back({valueBuffer, (size_t) block.block_size()});
 }
 
 void ParameterServer2::sendBackParameter(const ParameterBlock& block,
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
index ebb4245b9a7df22c09e60237c8a162e4a1fbaa35..698473060a4c1660b93c10b40053e074d2ac81e9 100644
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -148,7 +148,8 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
 
   std::vector<struct iovec> iovs;
   iovs.reserve(userIovs.size() + 2);
   iovs.push_back({&header, sizeof(header)});
-  iovs.push_back({&iovLengths[0], sizeof(iovLengths[0]) * header.numIovs});
+  iovs.push_back({&iovLengths[0],
+                  sizeof(iovLengths[0]) * (size_t) header.numIovs});
   iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
 
   header.totalLength = 0;
diff --git a/paddle/setup.py b/paddle/setup.py.in
similarity index 93%
rename from paddle/setup.py
rename to paddle/setup.py.in
index fabe2a6b4c1d5ba8f2a4912fc2317f3b935f9c31..da86eb795dc5890a5598f07e9c2bd62e503091f6 100644
--- a/paddle/setup.py
+++ b/paddle/setup.py.in
@@ -35,11 +35,11 @@ except:
   pass
 
 setup(name="py_paddle",
-  version="0.8.0b", # TODO(yuyang18): Make this version same as CMake
+  version="@PADDLE_VERSION@",
   ext_modules=[
     Extension('py_paddle._swig_paddle',  # Build SWIG Extension.
        ['Paddle_wrap.cxx'],
-       extra_link_args=["-Xlinker", '-start-group'] + 
+       extra_link_args=["-Xlinker", '-start-group'] +
                        extra_links + ["-Xlinker", "-end-group"]
     )
   ],
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index f47d3b08c167775c2ced4df90c9f034bf4f9cabd..d8a7a5dd4f12afc7edfbf2c5f28cbe31d7516153 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -79,7 +79,7 @@ protected:
   // The update function for after update operations, such as averager.
   void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
                       int tid, size_t numThreads, Parameter* para);
-  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)> 
+  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
       GetTraverseCallback;
   void traverse(GetTraverseCallback getTraverseCallback);
 };
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 53d8bb98f09e813e02fc6e053ef90e5e3602f828..b26a63e7f3c1d2fdfc5fea0a034a2f2c5238d1f0 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -262,8 +262,8 @@ def SubModelEnd(name = None):
 
 def MakeLayerNameInParentSubmodel(name):
     suffix = ""
-    for submodel in g_submodel_stack[1:]:
-        suffix = "@" + submodel.name + suffix
+    if len(g_submodel_stack) > 1:
+        suffix = "@" + g_submodel_stack[-1].name
     return name + suffix
 
 def GetLayerBaseName(name):
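To see what the `MakeLayerNameInParentSubmodel` change does, compare the two versions on a hypothetical submodel stack (names illustrative): the old loop accumulated every enclosing submodel into the suffix, while the new code appends only the innermost one.

```python
# Standalone sketch with plain strings in place of submodel objects.
def old_name(name, stack_names):
    suffix = ""
    for n in stack_names[1:]:
        suffix = "@" + n + suffix
    return name + suffix

def new_name(name, stack_names):
    if len(stack_names) > 1:
        return name + "@" + stack_names[-1]
    return name

stack = ["root", "outer", "inner"]  # hypothetical nesting
assert old_name("fc", stack) == "fc@inner@outer"
assert new_name("fc", stack) == "fc@inner"
```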