Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into variable_input

7430d305 · dangqingqing · 6dfba39c · a0be0ed6 · 7430d305 · 7430d305
51 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,7 @@ set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})

 include(system)

-project(paddle CXX C)
+project(paddle CXX C Go)

 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)

--- a/Dockerfile
+++ b/Dockerfile
@@ -33,6 +33,15 @@ RUN apt-get update && \
    clang-3.8 llvm-3.8 libclang-3.8-dev && \
    apt-get clean -y

+# Install Go
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store


--- a/Dockerfile.android
+++ b/Dockerfile.android
+FROM ubuntu:16.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+ENV HOME=/root \
+    ANDROID_NDK_HOME=/opt/android-ndk-linux \
+    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+
+RUN apt-get update && \
+    apt-get install -y \
+    git python-dev python-pip python-numpy \
+    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel sphinx && \
+    pip install pre-commit
+
+# Android NDK
+RUN mkdir /opt/android-ndk-tmp && \
+    cd /opt/android-ndk-tmp && \
+    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
+    unzip -q android-ndk-r14b-linux-x86_64.zip && \
+    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
+    rm -rf /opt/android-ndk-tmp && \
+    rm -rf ${ANDROID_NDK_HOME}
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
--- a/cmake/CMakeDetermineGoCompiler.cmake
+++ b/cmake/CMakeDetermineGoCompiler.cmake
+if(NOT CMAKE_Go_COMPILER)
+  if(NOT $ENV{GO_COMPILER} STREQUAL "")
+    get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
+
+    if(CMAKE_Go_FLAGS_ENV_INIT)
+      set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
+    endif()
+
+    if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT})
+      message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
+    endif()
+
+  endif()
+
+  set(Go_BIN_PATH
+    $ENV{GOPATH}
+    $ENV{GOROOT}
+    $ENV{GOROOT}/bin
+    $ENV{GO_COMPILER}
+    /usr/bin
+    /usr/local/bin
+    )
+
+  if(CMAKE_Go_COMPILER_INIT)
+    set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
+  else()
+    find_program(CMAKE_Go_COMPILER
+      NAMES go
+      PATHS ${Go_BIN_PATH}
+    )
+    if(CMAKE_Go_COMPILER)
+      EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION)
+      STRING(REGEX MATCH "go[0-9]+[.0-9]*[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
+      message("-- The Golang compiler identification is ${VERSION}")
+      message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
+    endif()
+  endif()
+
+endif()
+
+mark_as_advanced(CMAKE_Go_COMPILER)
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/CMakeGoCompiler.cmake.in
+  ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
+
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
--- a/cmake/CMakeGoCompiler.cmake.in
+++ b/cmake/CMakeGoCompiler.cmake.in
+set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")
+set(CMAKE_Go_COMPILER_LOADED 1)
+
+set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)
+set(CMAKE_Go_LINKER_PREFERENCE 40)
+set(CMAKE_Go_OUTPUT_EXTENSION .o)
+set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
+set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
--- a/cmake/CMakeGoInformation.cmake
+++ b/cmake/CMakeGoInformation.cmake
+if(NOT CMAKE_Go_COMPILE_OBJECT)
+  set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o <OBJECT> <SOURCE> ")
+endif()
+
+if(NOT CMAKE_Go_LINK_EXECUTABLE)
+  set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o <TARGET> <OBJECTS>  ")
+endif()
--- a/cmake/CMakeTestGoCompiler.cmake
+++ b/cmake/CMakeTestGoCompiler.cmake
+set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -27,6 +27,7 @@
 #
 # cmake_parse_arguments can help us to achieve this goal.
 # https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+#

 # cc_library parses tensor.cc and figures out that target also depend on tensor.h.
 # cc_library(tensor
@@ -139,3 +140,78 @@ function(nv_test TARGET_NAME)
  endif()
  add_test(${TARGET_NAME} ${TARGET_NAME})
 endfunction(nv_test)
+
+set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
+file(MAKE_DIRECTORY ${GOPATH})
+
+# Because api.go defines a GO wrapper to ops and tensor, it depends on
+# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api need to be re-built.
+# go_library(api
+#   SRCS
+#   api.go
+#   DEPS
+#   tensor # Because ops depend on tensor, this line is optional.
+#   ops)
+function(go_library TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (${go_library_OPTIONAL} STREQUAL "SHARED")
+    set(BUILD_MODE "-buildmode=c-shared")
+    if(APPLE)
+      set(LIB_NAME "lib${TARGET_NAME}.dylib")
+    else()
+      set(LIB_NAME "lib${TARGET_NAME}.so")
+    endif()  
+  else()
+    set(BUILD_MODE "-buildmode=c-archive")
+    set(LIB_NAME "lib${TARGET_NAME}.a")
+  endif()
+  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+    ${go_library_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
+  add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
+  add_library(${TARGET_NAME} STATIC IMPORTED)
+  set_property(TARGET ${TARGET_NAME} PROPERTY
+    IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}")
+  add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib)
+endfunction(go_library)
+
+function(go_binary TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
+    ${go_library_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})  
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})  
+  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
+endfunction(go_binary)
+
+function(go_test TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
+    ${go_test_SRCS}
+    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})  
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})  
+  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
+endfunction(go_test)
+
+# go_extern will download extern go project.
+# go_extern(target_name extern_source)
+# go_extern(go_redis github.com/hoisie/redis)
+function(go_extern TARGET_NAME)
+  add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN})
+endfunction(go_extern)
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -21,9 +21,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    src_forward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size)
+        name='src_forward_gru', input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
+        name='src_backward_gru',
+        input=src_embedding,
+        size=encoder_size,
+        reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
@@ -34,7 +37,9 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
    backward_first = paddle.layer.first_seq(input=src_backward)

    with paddle.layer.mixed(
-            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
+            name="decoder_boot_mixed",
+            size=decoder_size,
+            act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=backward_first)

@@ -44,11 +49,17 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
+            name="simple_attention",
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+        with paddle.layer.mixed(
+                name="input_recurrent",
+                size=decoder_size * 3,
+                # enable error clipping 
+                layer_attr=paddle.attr.ExtraAttr(
+                    error_clipping_threshold=100.0)) as decoder_inputs:
            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
            decoder_inputs += paddle.layer.full_matrix_projection(
                input=current_word)
@@ -57,9 +68,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
+            # uncomment to enable local threshold for gradient clipping
+            # param_attr=paddle.attr.ParamAttr(gradient_clipping_threshold=9.9),
            size=decoder_size)

        with paddle.layer.mixed(
+                name="gru_step_output",
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as out:
@@ -125,7 +139,13 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):


 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        # log gradient clipping info
+        log_clipping=True,
+        # log error clipping info
+        log_error_clipping=True)
    is_generating = False

    # source and target dict dim.
@@ -140,6 +160,8 @@ def main():
        # define optimize method and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=5e-5,
+            # uncomment to enable global threshold for gradient clipping
+            # gradient_clipping_threshold=10.0,
            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
        trainer = paddle.trainer.SGD(cost=cost,
                                     parameters=parameters,

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,6 +9,10 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)

+if(CMAKE_Go_COMPILER)
+  add_subdirectory(go)
+endif()
+
 find_package(Boost QUIET)

 if(Boost_FOUND)

--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -58,10 +58,16 @@ target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR}
 link_paddle_exe(paddle_capi_shared)

 # install library & headers.
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
-install(TARGETS paddle_capi_shared DESTINATION lib)
+if(ANDROID)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
+          DESTINATION lib/${ANDROID_ABI})
+  install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
+else(ANDROID)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+  install(TARGETS paddle_capi_shared DESTINATION lib)
+endif(ANDROID)

 # this variable used for unittest
 set(PADDLE_CAPI_INC_PATH

--- a/paddle/go/CMakeLists.txt
+++ b/paddle/go/CMakeLists.txt
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+go_library(adder SRCS adder.go)
+
+cc_test(cgo_test
+        SRCS
+        cgo_test.cc
+        DEPS
+        adder)
--- a/paddle/go/adder.go
+++ b/paddle/go/adder.go
+package main
+
+import "C"
+
+//export GoAdder
+func GoAdder(x, y int) int {
+	return x + y
+}
+
+func main() {} // Required but ignored
--- a/paddle/go/cclient/CMakeLists.txt
+++ b/paddle/go/cclient/CMakeLists.txt
@@ -2,12 +2,9 @@ cmake_minimum_required(VERSION 3.0)

 if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES)
  message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
-else()	
-  # find #include <majel/xx.h>
-  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-  include_directories(${PARENT_DIR})
-
+else()
  # find cmake directory modules
+  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
  get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
  get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)


--- a/paddle/go/cclient/test/CMakeLists.txt
+++ b/paddle/go/cclient/test/CMakeLists.txt
 cmake_minimum_required(VERSION 3.0)

-include_directories(/env/gopath/src/github.com/PaddlePaddle/Paddle/paddle/go/cclient/build/)
+include_directories(${CMAKE_BINARY_DIR})

 add_executable(main main.c)
 add_dependencies(main client)
 set (CMAKE_EXE_LINKER_FLAGS "-pthread")
-target_link_libraries(main /env/gopath/src/github.com/PaddlePaddle/Paddle/paddle/go/cclient/build/libclient.a) # ${GTEST_LIBRARIES})
+target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
--- a/paddle/go/cgo_test.cc
+++ b/paddle/go/cgo_test.cc
+#include <iostream>
+#include "gtest/gtest.h"
+#include "libadder.h"
+
+TEST(Cgo, Invoke) { EXPECT_EQ(GoAdder(30, 12), 42); }
--- a/paddle/go/cmd/pserver/.gitignore
+++ b/paddle/go/cmd/pserver/.gitignore
+pserver
--- a/paddle/go/cmd/pserver/pserver.go
+++ b/paddle/go/cmd/pserver/pserver.go
+package main
+
+import (
+	"flag"
+	"net"
+	"net/http"
+	"net/rpc"
+	"strconv"
+
+	"github.com/PaddlePaddle/Paddle/paddle/go/pserver"
+)
+
+func main() {
+	port := flag.Int("p", 0, "port of the pserver")
+	flag.Parse()
+
+	s := pserver.NewService()
+	err := rpc.Register(s)
+	if err != nil {
+		panic(err)
+	}
+
+	rpc.HandleHTTP()
+	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
+	if err != nil {
+		panic(err)
+	}
+
+	err = http.Serve(l, nil)
+	if err != nil {
+		panic(err)
+	}
+}
--- a/paddle/go/pserver/client.go
+++ b/paddle/go/pserver/client.go
 package pserver

-// ElementType is the type of elements of a Parameter.
-type ElementType int
-
-// Supported element types
-const (
-	Int32 ElementType = iota
-	UInt32
-	Int64
-	UInt64
-	Float32
-	Float64
-)
-
-// Parameter is a piece of data to sync with the parameter server.
-type Parameter struct {
-	Name        string
-	ElementType ElementType
-	Content     []byte
-}
-
-// ParameterWithConfig contains the parameter and the configuration.
-type ParameterWithConfig struct {
-	Param  Parameter
-	Config []byte // parameter configuration in Proto Buffer format
-}
-
-// Gradient is the gradient of the parameter.
-type Gradient Parameter
-
 // Client is the client to parameter servers.
 type Client struct {
 }

--- a/paddle/go/pserver/optimizer.c
+++ b/paddle/go/pserver/optimizer.c
+#include <stdlib.h>
+
+#include "optimizer.h"
+
+typedef int (*update_func)(void*, void*, paddle_element_type, const void*, int);
+typedef void (*release_func)(void*);
+
+typedef struct paddle_optimizer {
+  update_func update;
+  release_func release;
+  void* optimizer;
+} paddle_optimizer;
+
+void paddle_release_optimizer(paddle_optimizer* o) {
+  o->release(o->optimizer);
+  free(o);
+}
+
+int paddle_update_parameter(paddle_optimizer* o,
+                            void* buffer,
+                            paddle_element_type element_type,
+                            const void* gradient,
+                            int num_bytes) {
+  return o->update(o->optimizer, buffer, element_type, gradient, num_bytes);
+}
+
+typedef struct { double learning_rate; } SGD_optimizer;
+
+int update_SGD(void* optimizer,
+               void* buffer,
+               paddle_element_type element_type,
+               const void* gradient,
+               int num_bytes) {
+  SGD_optimizer* o = (SGD_optimizer*)optimizer;
+  // TODO
+  return 0;
+}
+
+void release_SGD(void* optimizer) {
+  SGD_optimizer* o = (SGD_optimizer*)optimizer;
+  // nothing allocated on heap
+}
+
+paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate) {
+  SGD_optimizer* impl = (SGD_optimizer*)malloc(sizeof(SGD_optimizer));
+  impl->learning_rate = learning_rate;
+  paddle_optimizer* opt = (paddle_optimizer*)malloc(sizeof(paddle_optimizer));
+  opt->update = update_SGD;
+  opt->release = release_SGD;
+  opt->optimizer = impl;
+  return opt;
+}
--- a/paddle/go/pserver/optimizer.go
+++ b/paddle/go/pserver/optimizer.go
+package pserver
+
+/*
+#include "optimizer.h"
+*/
+import "C"
+import (
+	"fmt"
+	"unsafe"
+)
+
+type optimizerType int
+
+const (
+	sgd optimizerType = iota
+)
+
+var nullPtr = unsafe.Pointer(uintptr(0))
+
+type optimizer struct {
+	opt *C.struct_paddle_optimizer
+}
+
+func newOptimizer(t optimizerType, learning_rate float64) *optimizer {
+	o := &optimizer{}
+	o.opt = C.paddle_create_SGD_optimizer(C.double(learning_rate))
+	return o
+}
+
+func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
+	if len(p.Content) != len(g.Content) {
+		return fmt.Errorf("parameter and gradient length not match, parameter: %d, gradient: %d", len(p.Content), len(g.Content))
+	}
+
+	if p.ElementType != g.ElementType {
+		return fmt.Errorf("parameter and gradient element type not match, parameter: %v, gradient: %v", p.ElementType, g.ElementType)
+	}
+
+	r := C.paddle_update_parameter(o.opt, unsafe.Pointer(&p.Content[0]), C.paddle_element_type(p.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
+	if r != 0 {
+		return fmt.Errorf("optimizer update returned error code: %d", r)
+	}
+	return nil
+}
+
+func (o *optimizer) Cleanup() {
+	if unsafe.Pointer(o.opt) != nullPtr {
+		C.paddle_release_optimizer(o.opt)
+		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+	}
+}
--- a/paddle/go/pserver/optimizer.h
+++ b/paddle/go/pserver/optimizer.h
+#ifndef PADDLE_PSERVER_OPTIMIZER_H
+#define PADDLE_PSERVER_OPTIMIZER_H
+
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32 = 0,
+  PADDLE_ELEMENT_TYPE_UINT32 = 1,
+  PADDLE_ELEMENT_TYPE_INT64 = 2,
+  PADDLE_ELEMENT_TYPE_UINT64 = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+struct paddle_optimizer;
+struct paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate);
+void paddle_release_optimizer(struct paddle_optimizer* o);
+int paddle_update_parameter(struct paddle_optimizer* o,
+                            void* buffer,
+                            paddle_element_type element_type,
+                            const void* gradient,
+                            int num_bytes);
+
+#endif /* PADDLE_PSERVER_OPTIMIZER_H */
--- a/paddle/go/pserver/optimizer_test.go
+++ b/paddle/go/pserver/optimizer_test.go
+package pserver
+
+import "testing"
+
+func TestSGDCreateRelease(t *testing.T) {
+	o := newOptimizer(sgd, 1)
+	o.Cleanup()
+}
--- a/paddle/go/pserver/service.go
+++ b/paddle/go/pserver/service.go
+package pserver
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+)
+
+// ElementType is the type of elements of a Parameter.
+type ElementType int
+
+var ErrAlreadyInitialized = errors.New("pserver already initialized")
+var ErrUninitialized = errors.New("pserver not fully initialized")
+
+// Supported element types
+const (
+	Int32 ElementType = iota
+	UInt32
+	Int64
+	UInt64
+	Float32
+	Float64
+)
+
+// Parameter is a piece of data to sync with the parameter server.
+type Parameter struct {
+	Name        string
+	ElementType ElementType
+	Content     []byte
+}
+
+// ParameterWithConfig contains the parameter and the configuration.
+type ParameterWithConfig struct {
+	Param  Parameter
+	Config []byte // parameter configuration in Proto Buffer format
+}
+
+// Gradient is the gradient of the parameter.
+type Gradient Parameter
+
+// Service is the RPC service for pserver.
+type Service struct {
+	initialized chan struct{}
+
+	mu       sync.Mutex
+	opt      *optimizer
+	paramMap map[string]Parameter
+}
+
+// NewService creates a new service.
+func NewService() *Service {
+	s := &Service{}
+	s.paramMap = make(map[string]Parameter)
+	s.initialized = make(chan struct{})
+	return s
+}
+
+// BeginInitParams tells the parameter server that the parameter
+// initialization has begun.
+func (s *Service) BeginInitParams(config []byte, dummy *int) error {
+	select {
+	case <-s.initialized:
+		return ErrAlreadyInitialized
+	default:
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if s.opt != nil {
+		s.opt.Cleanup()
+	}
+
+	// TODO(helin): parse learning rate from config
+	s.opt = newOptimizer(sgd, 0.01)
+	return nil
+}
+
+// InitParam initializes a parameter.
+func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
+	select {
+	case <-s.initialized:
+		return ErrAlreadyInitialized
+	default:
+	}
+
+	// TODO(helin): parse parameter config
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// TODO(helin): check if paramWithConfigs.Param.Content is
+	// properly memory aligned, if not, make copy to a memory
+	// aligned region.
+	s.paramMap[paramWithConfigs.Param.Name] = paramWithConfigs.Param
+	return nil
+}
+
+// FinishInitParams tells the parameter server that the parameter
+// initialization has finished.
+func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
+	select {
+	case <-s.initialized:
+		return ErrAlreadyInitialized
+	default:
+	}
+
+	close(s.initialized)
+	return nil
+}
+
+// SendGrads sends gradients to parameter servers for parameter
+// optimization.
+func (s *Service) SendGrads(grads []Gradient, dummy *int) error {
+	select {
+	case <-s.initialized:
+	default:
+		return ErrUninitialized
+	}
+
+	count := len(grads)
+	if count == 0 {
+		return nil
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	for _, g := range grads {
+		if _, ok := s.paramMap[g.Name]; !ok {
+			return fmt.Errorf("parameter: %s does not exist", g.Name)
+		}
+	}
+
+	errCh := make(chan error, count)
+	for _, g := range grads {
+		go func(p Parameter, g Gradient) {
+			err := s.opt.UpdateParameter(p, g)
+			errCh <- err
+		}(s.paramMap[g.Name], g)
+	}
+
+	recv := 0
+	for err := range errCh {
+		if err != nil {
+			return err
+		}
+
+		recv++
+		if recv == count {
+			break
+		}
+	}
+	return nil
+}
+
+// GetParams gets parameters from the parameter server.
+func (s *Service) GetParams(names []string, parameters *[]Parameter) error {
+	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	for _, n := range names {
+		if _, ok := s.paramMap[n]; !ok {
+			return fmt.Errorf("parameter: %s does not exist", n)
+		}
+	}
+
+	*parameters = make([]Parameter, len(names))
+	for i, n := range names {
+		// The parameter content (a byte slice) may change
+		// during RPC serialization due to write from other
+		// goroutine, we allow it since mini-batch based deep
+		// learning optimization methods are stochastic in
+		// nature. This race condition is allowed deliberately
+		// to save the program from making a copy of the
+		// paramter content.
+		(*parameters)[i] = s.paramMap[n]
+	}
+
+	return nil
+}
+
+// Save tells the parameter server to save parameters.
+func (s *Service) Save(path string, dummy *int) error {
+	<-s.initialized
+
+	// TODO
+	return nil
+}
--- a/paddle/go/pserver/service_test.go
+++ b/paddle/go/pserver/service_test.go
+package pserver_test
+
+import (
+	"reflect"
+	"sync"
+	"testing"
+
+	"github.com/PaddlePaddle/Paddle/paddle/go/pserver"
+)
+
+func TestFull(t *testing.T) {
+	s := pserver.NewService()
+	var dummy int
+	err := s.BeginInitParams(nil, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var p pserver.Parameter
+	p.Name = "param_a"
+	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
+	p.ElementType = pserver.Int32
+	err = s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var p1 pserver.Parameter
+	p1.Name = "param_b"
+	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+	p1.ElementType = pserver.Float32
+	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var params []pserver.Parameter
+	err = s.GetParams([]string{"param_b", "param_a"}, &params)
+	if err != nil {
+		t.FailNow()
+	}
+
+	if len(params) != 2 || !reflect.DeepEqual(params[0], p1) || !reflect.DeepEqual(params[0], p1) {
+		t.FailNow()
+	}
+
+	grads := []pserver.Gradient{pserver.Gradient(p1), pserver.Gradient(p)}
+	err = s.SendGrads(grads, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	var params1 []pserver.Parameter
+	err = s.GetParams([]string{"param_b", "param_a"}, &params1)
+	if err != nil {
+		t.FailNow()
+	}
+
+	if len(params) != 2 {
+		t.FailNow()
+	}
+
+	// don't compare content, since it's already changed by
+	// gradient update.
+	params1[0].Content = nil
+	params1[0].Content = nil
+	p.Content = nil
+	p1.Content = nil
+
+	if !reflect.DeepEqual(params1[0], p1) || !reflect.DeepEqual(params1[0], p1) {
+		t.FailNow()
+	}
+}
+
+func TestMultipleInit(t *testing.T) {
+	s := pserver.NewService()
+	var dummy int
+	err := s.BeginInitParams(nil, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	// this is fine, it's possible for client to call init
+	// multiple times.
+	err = s.BeginInitParams(nil, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != pserver.ErrAlreadyInitialized {
+		t.FailNow()
+	}
+
+	err = s.BeginInitParams(nil, &dummy)
+	if err != pserver.ErrAlreadyInitialized {
+		t.FailNow()
+	}
+}
+
+func TestUninitialized(t *testing.T) {
+	s := pserver.NewService()
+	var dummy int
+	err := s.SendGrads(nil, &dummy)
+	if err != pserver.ErrUninitialized {
+		t.FailNow()
+	}
+}
+
+func TestBlockUntilInitialized(t *testing.T) {
+	s := pserver.NewService()
+	ch := make(chan struct{}, 2)
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		var params []pserver.Parameter
+		err := s.GetParams(nil, &params)
+		if err != nil {
+			t.FailNow()
+		}
+		wg.Done()
+		ch <- struct{}{}
+	}()
+
+	wg.Add(1)
+	go func() {
+		var dummy int
+		err := s.Save("", &dummy)
+		if err != nil {
+			t.FailNow()
+		}
+		wg.Done()
+		ch <- struct{}{}
+	}()
+
+	var dummy int
+	err := s.BeginInitParams(nil, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	select {
+	case <-ch:
+		// some function returned before initialization is completed.
+		t.FailNow()
+	default:
+	}
+
+	err = s.FinishInitParams(0, &dummy)
+	if err != nil {
+		t.FailNow()
+	}
+
+	wg.Wait()
+}
--- a/paddle/go/recordio/README.md
+++ b/paddle/go/recordio/README.md
+# RecordIO
+
+## Write
+
+```go
+f, e := os.Create("a_file.recordio")
+w := recordio.NewWriter(f)
+w.Write([]byte("Hello"))
+w.Write([]byte("World!"))
+w.Close()
+```
+
+## Read
+
+1. Load chunk index:
+
+   ```go
+   f, e := os.Open("a_file.recordio")
+   idx, e := recordio.LoadIndex(f)
+   fmt.Println("Total records: ", idx.Len())
+   ```
+
+2. Create one or more scanner to read a range of records.  The
+   following example reads the range
+   [1, 3), i.e., the second and the third records:
+
+   ```go
+   f, e := os.Open("a_file.recordio")
+   s := recrodio.NewScanner(f, idx, 1, 3)
+   for s.Scan() {
+      fmt.Println(string(s.Record()))
+   }
+   if s.Err() != nil && s.Err() != io.EOF {
+      log.Fatalf("Something wrong with scanning: %v", e)
+   }
+   ```
--- a/paddle/go/recordio/chunk.go
+++ b/paddle/go/recordio/chunk.go
+package recordio
+
+import (
+	"bytes"
+	"compress/gzip"
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"io"
+
+	"github.com/golang/snappy"
+)
+
+// A Chunk contains the Header and optionally compressed records.  To
+// create a chunk, just use ch := &Chunk{}.
+type Chunk struct {
+	records  [][]byte
+	numBytes int // sum of record lengths.
+}
+
+func (ch *Chunk) add(record []byte) {
+	ch.records = append(ch.records, record)
+	ch.numBytes += len(record)
+}
+
+// dump the chunk into w, and clears the chunk and makes it ready for
+// the next add invocation.
+func (ch *Chunk) dump(w io.Writer, compressorIndex int) error {
+	// NOTE: don't check ch.numBytes instead, because empty
+	// records are allowed.
+	if len(ch.records) == 0 {
+		return nil
+	}
+
+	// Write raw records and their lengths into data buffer.
+	var data bytes.Buffer
+
+	for _, r := range ch.records {
+		var rs [4]byte
+		binary.LittleEndian.PutUint32(rs[:], uint32(len(r)))
+
+		if _, e := data.Write(rs[:]); e != nil {
+			return fmt.Errorf("Failed to write record length: %v", e)
+		}
+
+		if _, e := data.Write(r); e != nil {
+			return fmt.Errorf("Failed to write record: %v", e)
+		}
+	}
+
+	compressed, e := compressData(&data, compressorIndex)
+	if e != nil {
+		return e
+	}
+
+	// Write chunk header and compressed data.
+	hdr := &Header{
+		checkSum:       crc32.ChecksumIEEE(compressed.Bytes()),
+		compressor:     uint32(compressorIndex),
+		compressedSize: uint32(compressed.Len()),
+		numRecords:     uint32(len(ch.records)),
+	}
+
+	if _, e := hdr.write(w); e != nil {
+		return fmt.Errorf("Failed to write chunk header: %v", e)
+	}
+
+	if _, e := w.Write(compressed.Bytes()); e != nil {
+		return fmt.Errorf("Failed to write chunk data: %v", e)
+	}
+
+	// Clear the current chunk.
+	ch.records = nil
+	ch.numBytes = 0
+
+	return nil
+}
+
+type noopCompressor struct {
+	*bytes.Buffer
+}
+
+func (c *noopCompressor) Close() error {
+	return nil
+}
+
+func compressData(src io.Reader, compressorIndex int) (*bytes.Buffer, error) {
+	compressed := new(bytes.Buffer)
+	var compressor io.WriteCloser
+
+	switch compressorIndex {
+	case NoCompression:
+		compressor = &noopCompressor{compressed}
+	case Snappy:
+		compressor = snappy.NewBufferedWriter(compressed)
+	case Gzip:
+		compressor = gzip.NewWriter(compressed)
+	default:
+		return nil, fmt.Errorf("Unknown compression algorithm: %d", compressorIndex)
+	}
+
+	if _, e := io.Copy(compressor, src); e != nil {
+		return nil, fmt.Errorf("Failed to compress chunk data: %v", e)
+	}
+	compressor.Close()
+
+	return compressed, nil
+}
+
+// parse the specified chunk from r.
+func parseChunk(r io.ReadSeeker, chunkOffset int64) (*Chunk, error) {
+	var e error
+	var hdr *Header
+
+	if _, e = r.Seek(chunkOffset, io.SeekStart); e != nil {
+		return nil, fmt.Errorf("Failed to seek chunk: %v", e)
+	}
+
+	hdr, e = parseHeader(r)
+	if e != nil {
+		return nil, fmt.Errorf("Failed to parse chunk header: %v", e)
+	}
+
+	var buf bytes.Buffer
+	if _, e = io.CopyN(&buf, r, int64(hdr.compressedSize)); e != nil {
+		return nil, fmt.Errorf("Failed to read chunk data: %v", e)
+	}
+
+	if hdr.checkSum != crc32.ChecksumIEEE(buf.Bytes()) {
+		return nil, fmt.Errorf("Checksum checking failed.")
+	}
+
+	deflated, e := deflateData(&buf, int(hdr.compressor))
+	if e != nil {
+		return nil, e
+	}
+
+	ch := &Chunk{}
+	for i := 0; i < int(hdr.numRecords); i++ {
+		var rs [4]byte
+		if _, e = deflated.Read(rs[:]); e != nil {
+			return nil, fmt.Errorf("Failed to read record length: %v", e)
+		}
+
+		r := make([]byte, binary.LittleEndian.Uint32(rs[:]))
+		if _, e = deflated.Read(r); e != nil {
+			return nil, fmt.Errorf("Failed to read a record: %v", e)
+		}
+
+		ch.records = append(ch.records, r)
+		ch.numBytes += len(r)
+	}
+
+	return ch, nil
+}
+
+func deflateData(src io.Reader, compressorIndex int) (*bytes.Buffer, error) {
+	var e error
+	var deflator io.Reader
+
+	switch compressorIndex {
+	case NoCompression:
+		deflator = src
+	case Snappy:
+		deflator = snappy.NewReader(src)
+	case Gzip:
+		deflator, e = gzip.NewReader(src)
+		if e != nil {
+			return nil, fmt.Errorf("Failed to create gzip reader: %v", e)
+		}
+	default:
+		return nil, fmt.Errorf("Unknown compression algorithm: %d", compressorIndex)
+	}
+
+	deflated := new(bytes.Buffer)
+	if _, e = io.Copy(deflated, deflator); e != nil {
+		return nil, fmt.Errorf("Failed to deflate chunk data: %v", e)
+	}
+
+	return deflated, nil
+}
--- a/paddle/go/recordio/header.go
+++ b/paddle/go/recordio/header.go
+package recordio
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+)
+
+const (
+	// NoCompression means writing raw chunk data into files.
+	// With other choices, chunks are compressed before written.
+	NoCompression = iota
+	// Snappy had been the default compressing algorithm widely
+	// used in Google.  It compromises between speech and
+	// compression ratio.
+	Snappy
+	// Gzip is a well-known compression algorithm.  It is
+	// recommmended only you are looking for compression ratio.
+	Gzip
+
+	magicNumber       uint32 = 0x01020304
+	defaultCompressor        = Snappy
+)
+
+// Header is the metadata of Chunk.
+type Header struct {
+	checkSum       uint32
+	compressor     uint32
+	compressedSize uint32
+	numRecords     uint32
+}
+
+func (c *Header) write(w io.Writer) (int, error) {
+	var buf [20]byte
+	binary.LittleEndian.PutUint32(buf[0:4], magicNumber)
+	binary.LittleEndian.PutUint32(buf[4:8], c.checkSum)
+	binary.LittleEndian.PutUint32(buf[8:12], c.compressor)
+	binary.LittleEndian.PutUint32(buf[12:16], c.compressedSize)
+	binary.LittleEndian.PutUint32(buf[16:20], c.numRecords)
+	return w.Write(buf[:])
+}
+
+func parseHeader(r io.Reader) (*Header, error) {
+	var buf [20]byte
+	if _, e := r.Read(buf[:]); e != nil {
+		return nil, e
+	}
+
+	if v := binary.LittleEndian.Uint32(buf[0:4]); v != magicNumber {
+		return nil, fmt.Errorf("Failed to parse magic number")
+	}
+
+	return &Header{
+		checkSum:       binary.LittleEndian.Uint32(buf[4:8]),
+		compressor:     binary.LittleEndian.Uint32(buf[8:12]),
+		compressedSize: binary.LittleEndian.Uint32(buf[12:16]),
+		numRecords:     binary.LittleEndian.Uint32(buf[16:20]),
+	}, nil
+}
--- a/paddle/go/recordio/reader.go
+++ b/paddle/go/recordio/reader.go
+package recordio
+
+import "io"
+
+// Index consists offsets and sizes of the consequetive chunks in a RecordIO file.
+type Index struct {
+	chunkOffsets []int64
+	chunkLens    []uint32
+	numRecords   int   // the number of all records in a file.
+	chunkRecords []int // the number of records in chunks.
+}
+
+// LoadIndex scans the file and parse chunkOffsets, chunkLens, and len.
+func LoadIndex(r io.ReadSeeker) (*Index, error) {
+	f := &Index{}
+	offset := int64(0)
+	var e error
+	var hdr *Header
+
+	for {
+		hdr, e = parseHeader(r)
+		if e != nil {
+			break
+		}
+
+		f.chunkOffsets = append(f.chunkOffsets, offset)
+		f.chunkLens = append(f.chunkLens, hdr.numRecords)
+		f.chunkRecords = append(f.chunkRecords, int(hdr.numRecords))
+		f.numRecords += int(hdr.numRecords)
+
+		offset, e = r.Seek(int64(hdr.compressedSize), io.SeekCurrent)
+		if e != nil {
+			break
+		}
+	}
+
+	if e == io.EOF {
+		return f, nil
+	}
+	return nil, e
+}
+
+// NumRecords returns the total number of records in a RecordIO file.
+func (r *Index) NumRecords() int {
+	return r.numRecords
+}
+
+// NumChunks returns the total number of chunks in a RecordIO file.
+func (r *Index) NumChunks() int {
+	return len(r.chunkLens)
+}
+
+// ChunkIndex return the Index of i-th Chunk.
+func (r *Index) ChunkIndex(i int) *Index {
+	idx := &Index{}
+	idx.chunkOffsets = []int64{r.chunkOffsets[i]}
+	idx.chunkLens = []uint32{r.chunkLens[i]}
+	idx.chunkRecords = []int{r.chunkRecords[i]}
+	idx.numRecords = idx.chunkRecords[0]
+	return idx
+}
+
+// Locate returns the index of chunk that contains the given record,
+// and the record index within the chunk.  It returns (-1, -1) if the
+// record is out of range.
+func (r *Index) Locate(recordIndex int) (int, int) {
+	sum := 0
+	for i, l := range r.chunkLens {
+		sum += int(l)
+		if recordIndex < sum {
+			return i, recordIndex - sum + int(l)
+		}
+	}
+	return -1, -1
+}
+
+// Scanner scans records in a specified range within [0, numRecords).
+type Scanner struct {
+	reader          io.ReadSeeker
+	index           *Index
+	start, end, cur int
+	chunkIndex      int
+	chunk           *Chunk
+	err             error
+}
+
+// NewScanner creates a scanner that sequencially reads records in the
+// range [start, start+len).  If start < 0, it scans from the
+// beginning.  If len < 0, it scans till the end of file.
+func NewScanner(r io.ReadSeeker, index *Index, start, len int) *Scanner {
+	if start < 0 {
+		start = 0
+	}
+	if len < 0 || start+len >= index.NumRecords() {
+		len = index.NumRecords() - start
+	}
+
+	return &Scanner{
+		reader:     r,
+		index:      index,
+		start:      start,
+		end:        start + len,
+		cur:        start - 1, // The intial status required by Scan.
+		chunkIndex: -1,
+		chunk:      &Chunk{},
+	}
+}
+
+// Scan moves the cursor forward for one record and loads the chunk
+// containing the record if not yet.
+func (s *Scanner) Scan() bool {
+	s.cur++
+
+	if s.cur >= s.end {
+		s.err = io.EOF
+	} else {
+		if ci, _ := s.index.Locate(s.cur); s.chunkIndex != ci {
+			s.chunkIndex = ci
+			s.chunk, s.err = parseChunk(s.reader, s.index.chunkOffsets[ci])
+		}
+	}
+
+	return s.err == nil
+}
+
+// Record returns the record under the current cursor.
+func (s *Scanner) Record() []byte {
+	_, ri := s.index.Locate(s.cur)
+	return s.chunk.records[ri]
+}
+
+// Error returns the error that stopped Scan.
+func (s *Scanner) Error() error {
+	return s.err
+}
--- a/paddle/go/recordio/recordio_internal_test.go
+++ b/paddle/go/recordio/recordio_internal_test.go
+package recordio
+
+import (
+	"bytes"
+	"testing"
+	"unsafe"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestChunkHead(t *testing.T) {
+	assert := assert.New(t)
+
+	c := &Header{
+		checkSum:       123,
+		compressor:     456,
+		compressedSize: 789,
+	}
+
+	var buf bytes.Buffer
+	_, e := c.write(&buf)
+	assert.Nil(e)
+
+	cc, e := parseHeader(&buf)
+	assert.Nil(e)
+	assert.Equal(c, cc)
+}
+
+func TestWriteAndRead(t *testing.T) {
+	assert := assert.New(t)
+
+	data := []string{
+		"12345",
+		"1234",
+		"12"}
+
+	var buf bytes.Buffer
+	w := NewWriter(&buf, 10, NoCompression) // use a small maxChunkSize.
+
+	n, e := w.Write([]byte(data[0])) // not exceed chunk size.
+	assert.Nil(e)
+	assert.Equal(5, n)
+
+	n, e = w.Write([]byte(data[1])) // not exceed chunk size.
+	assert.Nil(e)
+	assert.Equal(4, n)
+
+	n, e = w.Write([]byte(data[2])) // exeeds chunk size, dump and create a new chunk.
+	assert.Nil(e)
+	assert.Equal(n, 2)
+
+	assert.Nil(w.Close()) // flush the second chunk.
+	assert.Nil(w.Writer)
+
+	n, e = w.Write([]byte("anything")) // not effective after close.
+	assert.NotNil(e)
+	assert.Equal(n, 0)
+
+	idx, e := LoadIndex(bytes.NewReader(buf.Bytes()))
+	assert.Nil(e)
+	assert.Equal([]uint32{2, 1}, idx.chunkLens)
+	assert.Equal(
+		[]int64{0,
+			int64(4 + // magic number
+				unsafe.Sizeof(Header{}) +
+				5 + // first record
+				4 + // second record
+				2*4)}, // two record legnths
+		idx.chunkOffsets)
+
+	s := NewScanner(bytes.NewReader(buf.Bytes()), idx, -1, -1)
+	i := 0
+	for s.Scan() {
+		assert.Equal(data[i], string(s.Record()))
+		i++
+	}
+}
+
+func TestWriteEmptyFile(t *testing.T) {
+	assert := assert.New(t)
+
+	var buf bytes.Buffer
+	w := NewWriter(&buf, 10, NoCompression) // use a small maxChunkSize.
+	assert.Nil(w.Close())
+	assert.Equal(0, buf.Len())
+
+	idx, e := LoadIndex(bytes.NewReader(buf.Bytes()))
+	assert.Nil(e)
+	assert.Equal(0, idx.NumRecords())
+}
--- a/paddle/go/recordio/recordio_test.go
+++ b/paddle/go/recordio/recordio_test.go
+package recordio_test
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+
+	"github.com/PaddlePaddle/Paddle/paddle/go/recordio"
+)
+
+func TestWriteRead(t *testing.T) {
+	const total = 1000
+	var buf bytes.Buffer
+	w := recordio.NewWriter(&buf, 0, -1)
+	for i := 0; i < total; i++ {
+		_, err := w.Write(make([]byte, i))
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	w.Close()
+
+	idx, err := recordio.LoadIndex(bytes.NewReader(buf.Bytes()))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if idx.NumRecords() != total {
+		t.Fatal("num record does not match:", idx.NumRecords(), total)
+	}
+
+	s := recordio.NewScanner(bytes.NewReader(buf.Bytes()), idx, -1, -1)
+	i := 0
+	for s.Scan() {
+		if !reflect.DeepEqual(s.Record(), make([]byte, i)) {
+			t.Fatal("not equal:", len(s.Record()), len(make([]byte, i)))
+		}
+		i++
+	}
+
+	if i != total {
+		t.Fatal("total count not match:", i, total)
+	}
+}
+
+func TestChunkIndex(t *testing.T) {
+	const total = 1000
+	var buf bytes.Buffer
+	w := recordio.NewWriter(&buf, 0, -1)
+	for i := 0; i < total; i++ {
+		_, err := w.Write(make([]byte, i))
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	w.Close()
+
+	idx, err := recordio.LoadIndex(bytes.NewReader(buf.Bytes()))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if idx.NumChunks() != total {
+		t.Fatal("unexpected chunk num:", idx.NumChunks(), total)
+	}
+
+	for i := 0; i < total; i++ {
+		newIdx := idx.ChunkIndex(i)
+		s := recordio.NewScanner(bytes.NewReader(buf.Bytes()), newIdx, -1, -1)
+		j := 0
+		for s.Scan() {
+			if !reflect.DeepEqual(s.Record(), make([]byte, i)) {
+				t.Fatal("not equal:", len(s.Record()), len(make([]byte, i)))
+			}
+			j++
+		}
+		if j != 1 {
+			t.Fatal("unexpected record per chunk:", j)
+		}
+	}
+}
--- a/paddle/go/recordio/writer.go
+++ b/paddle/go/recordio/writer.go
+package recordio
+
+import (
+	"fmt"
+	"io"
+)
+
+const (
+	defaultMaxChunkSize = 32 * 1024 * 1024
+)
+
+// Writer creates a RecordIO file.
+type Writer struct {
+	io.Writer    // Set to nil to mark a closed writer.
+	chunk        *Chunk
+	maxChunkSize int // total records size, excluding metadata, before compression.
+	compressor   int
+}
+
+// NewWriter creates a RecordIO file writer.  Each chunk is compressed
+// using the deflate algorithm given compression level.  Note that
+// level 0 means no compression and -1 means default compression.
+func NewWriter(w io.Writer, maxChunkSize, compressor int) *Writer {
+	if maxChunkSize < 0 {
+		maxChunkSize = defaultMaxChunkSize
+	}
+
+	if compressor < 0 {
+		compressor = defaultCompressor
+	}
+
+	return &Writer{
+		Writer:       w,
+		chunk:        &Chunk{},
+		maxChunkSize: maxChunkSize,
+		compressor:   compressor}
+}
+
+// Writes a record.  It returns an error if Close has been called.
+func (w *Writer) Write(record []byte) (int, error) {
+	if w.Writer == nil {
+		return 0, fmt.Errorf("Cannot write since writer had been closed")
+	}
+
+	if w.chunk.numBytes+len(record) > w.maxChunkSize {
+		if e := w.chunk.dump(w.Writer, w.compressor); e != nil {
+			return 0, e
+		}
+	}
+
+	w.chunk.add(record)
+	return len(record), nil
+}
+
+// Close flushes the current chunk and makes the writer invalid.
+func (w *Writer) Close() error {
+	e := w.chunk.dump(w.Writer, w.compressor)
+	w.Writer = nil
+	return e
+}
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -396,6 +396,44 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(exponential)

+/**
+ * @brief Reciprocal Activation.
+ * \f[
+ * f(z) = 1/z
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(reciprocal)
+Error __must_check forward(Argument& act) {
+  act.value->reciprocal2();
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotMulSquare(*act.value);
+  act.grad->neg();
+  return Error();
+}
+END_DEFINE_ACTIVATION(reciprocal)
+
+/**
+ * @brief Square Root Activation.
+ * \f[
+ * f(z) = sqrt(z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(sqrt)
+Error __must_check forward(Argument& act) {
+  act.value->sqrt2();
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.value);
+  act.grad->mulScalar(0.5);
+  return Error();
+}
+END_DEFINE_ACTIVATION(sqrt)
+
 /**
 * @brief Logarithm Activation.
 * \f[

--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <vector>

 #include "paddle/math/Vector.h"
+#include "paddle/utils/StringUtil.h"

 #include "Evaluator.h"

@@ -74,6 +75,7 @@ class ChunkEvaluator : public Evaluator {
  std::vector<Segment> labelSegments_;
  std::vector<Segment> outputSegments_;
  std::set<int> excludedChunkTypes_;
+  mutable std::unordered_map<std::string, real> values_;

 public:
  virtual void init(const EvaluatorConfig& config) {
@@ -121,11 +123,9 @@ public:
  }

  virtual void printStats(std::ostream& os) const {
-    double precision = (double)numCorrect_ / numOutputSegments_;
-    double recall = (double)numCorrect_ / numLabelSegments_;
-    double f1 =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-    os << config_.name() << "=" << f1 << " true_chunks=" << numLabelSegments_
+    storeLocalValues();
+    os << config_.name() << "=" << values_["F1-score"]
+       << " true_chunks=" << numLabelSegments_
       << " result_chunks=" << numOutputSegments_
       << " correct_chunks=" << numCorrect_;
  }
@@ -243,6 +243,46 @@ public:
    if (tag == tagSingle_) return true;
    return false;
  }
+
+  // three metrics: precision, recall and F1-score
+  void getNames(std::vector<std::string>* names) {
+    storeLocalValues();
+    names->reserve(names->size() + values_.size());
+    for (auto it = values_.begin(); it != values_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
+    }
+  }
+
+  // get value by field name
+  real getValue(const std::string& name, Error* err) const {
+    storeLocalValues();
+    std::vector<std::string> buffers;
+    paddle::str::split(name, '.', &buffers);
+    auto it = values_.find(buffers.back());
+    if (it == values_.end()) {  // not found
+      *err = Error("No such key %s", name.c_str());
+      return 0.0f;
+    }
+
+    return it->second;
+  }
+
+  // get type of evaluator
+  std::string getTypeImpl() const { return "chunk"; }
+
+private:
+  void storeLocalValues() const {
+    CHECK_GE(numOutputSegments_, 0);
+    CHECK_GE(numLabelSegments_, 0);
+    double precision =
+        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
+    double recall =
+        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
+    values_["precision"] = precision;
+    values_["recall"] = recall;
+    values_["F1-score"] =
+        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
+  }
 };

 REGISTER_EVALUATOR(chunk, ChunkEvaluator);

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -180,7 +180,6 @@ int getri<double>(const CBLAS_ORDER order,
                  const int lda,
                  const int* ipiv) {
  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
-  return 0;
 }

 template <>

--- a/paddle/math/TensorEvaluate.h
+++ b/paddle/math/TensorEvaluate.h
@@ -103,7 +103,10 @@ inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
 }
 #else
 template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {}
+inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
+  LOG(FATAL) << "Since it is gcc compiled, "
+                "this calculation does not support GPU implementation.";
+}
 #endif

 }  // namespace paddle
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -161,6 +161,7 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
                                        const ParameterConfig& config,
                                        size_t sparseId) const {
  CHECK(sparseId == -1LU) << "Sparse update is not supported";
+
  BaseMatrix& value = *vecs[PARAMETER_VALUE];
  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
@@ -265,6 +266,7 @@ void AdamParameterOptimizer::update(const VectorPtr vecs[],
                                    const ParameterConfig& config,
                                    size_t sparseId) const {
  CHECK(sparseId == -1UL) << "Sparse update is not supported";
+
  real beta1_power = std::pow(beta1_, step_);
  real beta2_power = std::pow(beta2_, step_);
  real learningRate = config.learning_rate() * learningRate_;
@@ -303,18 +305,25 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
 void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
                                           const ParameterConfig& config,
                                           size_t sparseId) const {
+  real globalThreshold = optConfig_.gradient_clipping_threshold();
+  real localThreshold = config.gradient_clipping_threshold();
+
+  // Use local gradient clipping threshold if it's enabled,
+  // otherwise using the global one.
+  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
+  std::string field = localThreshold > 0.0f ? "local" : "global";
+
  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > config.gradient_clipping_threshold()) {
+  if (maxAbsGrad > threshold) {
    if (FLAGS_log_clipping) {
      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
                        vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping,"
-                << " max grad=" << maxAbsGrad << " avg grad=" << avgAbsGrad;
+      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
+                << field << " threshold=" << threshold
+                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
    }
-    vecs[PARAMETER_GRADIENT]->clip(-config.gradient_clipping_threshold(),
-                                   config.gradient_clipping_threshold());
+    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
  }
-
  optimizer_->update(vecs, config, sparseId);
 }


--- a/paddle/parameter/OptimizerWithRegularizer.cpp
+++ b/paddle/parameter/OptimizerWithRegularizer.cpp
@@ -131,7 +131,8 @@ ParameterOptimizer* OptimizerWithRegularizer::create(
    bool inPserver) {
  ParameterOptimizer* optimizer =
      ParameterOptimizer::create(optConfig, inPserver);
-  if (paraConfig.gradient_clipping_threshold() > 0.0f &&
+  if ((optConfig.gradient_clipping_threshold() > 0.0f ||
+       paraConfig.gradient_clipping_threshold() > 0.0f) &&
      !dynamic_cast<AddOptimizer*>(optimizer)) {
    optimizer = new OptimizerWithGradientClipping(optConfig, optimizer);
  }

--- a/paddle/parameter/ParameterOptimizer.h
+++ b/paddle/parameter/ParameterOptimizer.h
@@ -167,6 +167,7 @@ public:
    }
    parameterTypes_.push_back(type);
  }
+
  real getLearningRate() const { return learningRate_; }

  virtual void setNoDecay() { applyDecay_ = false; }
@@ -201,6 +202,7 @@ protected:
   * so, if lr change in StartBatch, please assign to learningRate_
   */
  real learningRate_;
+
  std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
  int64_t pass_;  // current training pass (starting from 0)
  bool firstTime_;

--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
+#!/bin/bash
+
+set -xe
+
+mkdir -p /paddle/build
+cd /paddle/build
+rm -f /paddle/install 2>/dev/null || true
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DHOST_C_COMPILER=/usr/bin/gcc \
+      -DHOST_CXX_COMPILER=/usr/bin/g++ \
+      -DCMAKE_INSTALL_PREFIX=/paddle/install \
+      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+      -DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \
+      -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+make -j `nproc`
+make install
+
+export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
+paddle version
--- a/proto/TrainerConfig.proto
+++ b/proto/TrainerConfig.proto
@@ -128,6 +128,9 @@ message OptimizationConfig {
  // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
  // current async gradient will be discard silently.
  optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
+
+  // global threshold for gradient clipping 
+  optional double gradient_clipping_threshold = 38 [default = 0.0];
 };

 message TrainerConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3377,6 +3377,7 @@ settings = dict(
    algorithm='async_sgd',
    async_lagged_grad_discard_ratio=1.5,
    learning_method='momentum',
+    gradient_clipping_threshold=None,
    num_batches_per_send_parameter=None,
    num_batches_per_get_parameter=None,
    center_parameter_update_method=None,

--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,7 @@ __all__ = [
    "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
    'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
    "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation"
 ]


@@ -224,3 +224,27 @@ class LogActivation(BaseActivation):

    def __init__(self):
        BaseActivation.__init__(self, 'log', False)
+
+
+class SqrtActivation(BaseActivation):
+    """
+    Square Root Activation.
+
+    .. math::
+       f(z) = sqrt(z)
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'sqrt', False)
+
+
+class ReciprocalActivation(BaseActivation):
+    """
+    Reciprocal Activation.
+
+    .. math::
+       f(z) = 1/z
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'reciprocal', False)
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -347,32 +347,71 @@ def chunk_evaluator(
        excluded_chunk_types=None, ):
    """
    Chunk evaluator is used to evaluate segment labelling accuracy for a
-    sequence. It calculates the chunk detection F1 score.
+    sequence. It calculates precision, recall and F1 scores for the chunk detection.

-    A chunk is correctly detected if its beginning, end and type are correct.
-    Other chunk type is ignored.
+    To use chunk evaluator, several concepts need to be clarified firstly.

-    For each label in the label sequence, we have:
+    * **Chunk type** is the type of the whole chunk and a chunk consists of one or several words.  (For example in NER, ORG for organization name, PER for person name etc.)

-    .. code-block:: python
+    * **Tag type** indicates the position of a word in a chunk. (B for begin, I for inside, E for end, S for single)
+    We can name a label by combining tag type and chunk type. (ie. B-ORG for begining of an organization name)

-       tagType = label % numTagType
-       chunkType = label / numTagType
-       otherChunkType = numChunkTypes
+    The construction of label dictionary should obey the following rules:

-    The total number of different labels is numTagType*numChunkTypes+1.
-    We support 4 labelling scheme.
-    The tag type for each of the scheme is shown as follows:
+    - Use one of the listed labelling schemes. These schemes differ in ways indicating chunk boundry.

-    .. code-block:: python
+    .. code-block:: text
+
+        Scheme    Description                                                                                  
+        plain    Use the same label for the whole chunk.
+        IOB      Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside. 
+        IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
+        IOBES    Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk. 
+   
+    To make it clear, let's illustrate by an NER example.
+    Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
+    if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
+    in which B-ORG for begining of ORG and I-ORG for inside of ORG.
+    Prefixes which are called 'tag type' here are added to chunk types and there are two tag types including B and I.
+    Of course, the training data should be labeled accordingly.
+
+    - Mapping is done correctly by the listed equations and assigning protocol.
+
+    The following table are equations to extract tag type and chunk type from a label.
+
+    .. code-block:: text
+
+        tagType = label % numTagType
+        chunkType = label / numTagType
+        otherChunkType = numChunkTypes
+    
+    The following table shows the mapping rule between tagType and tag type in each scheme.
+
+    .. code-block:: text
+
+        Scheme Begin Inside End   Single
+        plain  0     -      -     -
+        IOB    0     1      -     -
+        IOE    -     0      1     -
+        IOBES  0     1      2     3
+
+    Continue the NER example, and the label dict should look like this to satify above equations:
+
+    .. code-block:: text

-       Scheme Begin Inside End   Single
-       plain  0     -      -     -
-       IOB    0     1      -     -
-       IOE    -     0      1     -
-       IOBES  0     1      2     3
+        B-ORG  0
+        I-ORG  1
+        B-PER  2
+        I-PER  3
+        B-LOC  4
+        I-LOC  5
+        O      6

-    'plain' means the whole chunk must contain exactly the same chunk label.
+    In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC, because the scheme is
+    "IOB" so tagType has two values: 0 for B and 1 for I. 
+    Here we will use I-LOC to explain the above mapping rules in detail.
+    For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC
+    and the tag is I.

    The simple usage is:

@@ -380,6 +419,7 @@ def chunk_evaluator(

       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)

+    
    :param input: The input layers.
    :type input: LayerOutput
    :param label: An input layer containing the ground truth label.

--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -40,6 +40,8 @@ register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
 register_unary_math_op('relu', act.ReluActivation())
+register_unary_math_op('sqrt', act.SqrtActivation())
+register_unary_math_op('reciprocal', act.ReciprocalActivation())


 def add(layeroutput, other):

--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -408,7 +408,8 @@ def settings(batch_size,

    args = [
        'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
+        'gradient_clipping_threshold'
    ]
    kwargs = dict()
    kwargs['algorithm'] = algorithm

--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -4,6 +4,8 @@ settings(batch_size=1000, learning_rate=1e-5)

 x = data_layer(name='data', size=100)
 x = layer_math.exp(x)
+x = layer_math.sqrt(x)
+x = layer_math.reciprocal(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -20,13 +20,43 @@ layers {
    }
  }
 }
+layers {
+  name: "__sqrt_0__"
+  type: "mixed"
+  size: 100
+  active_type: "sqrt"
+  inputs {
+    input_layer_name: "__exp_0__"
+    proj_conf {
+      type: "identity"
+      name: "___sqrt_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
+layers {
+  name: "__reciprocal_0__"
+  type: "mixed"
+  size: 100
+  active_type: "reciprocal"
+  inputs {
+    input_layer_name: "__sqrt_0__"
+    proj_conf {
+      type: "identity"
+      name: "___reciprocal_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
  name: "__log_0__"
  type: "mixed"
  size: 100
  active_type: "log"
  inputs {
-    input_layer_name: "__exp_0__"
+    input_layer_name: "__reciprocal_0__"
    proj_conf {
      type: "identity"
      name: "___log_0__.w0"
@@ -351,6 +381,8 @@ sub_models {
  name: "root"
  layer_names: "data"
  layer_names: "__exp_0__"
+  layer_names: "__sqrt_0__"
+  layer_names: "__reciprocal_0__"
  layer_names: "__log_0__"
  layer_names: "__abs_0__"
  layer_names: "__sigmoid_0__"

--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MQ2007 dataset
+
+MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
+validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
+validation set and testing set.
+
+MQ2007 dataset from website
+http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
+
+"""
+
+import os
+import random
+import functools
+import rarfile
+from common import download
+import numpy as np
+
+# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
+URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
+MD5 = "7be1640ae95c6408dab0ae7207bdc706"
+
+
+def __initialize_meta_info__():
+    """
+  download and extract the MQ2007 dataset
+  """
+    fn = fetch()
+    rar = rarfile.RarFile(fn)
+    dirpath = os.path.dirname(fn)
+    rar.extractall(path=dirpath)
+    return dirpath
+
+
+class Query(object):
+    """
+  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
+
+  Parameters:
+  ----------
+  query_id : int
+    query_id in dataset, mapping from query to relevance documents
+  relevance_score : int 
+    relevance score of query and document pair
+  feature_vector : array, dense feature
+    feature in vector format
+  description : string
+    comment section in query doc pair data
+  """
+
+    def __init__(self,
+                 query_id=-1,
+                 relevance_score=-1,
+                 feature_vector=None,
+                 description=""):
+        self.query_id = query_id
+        self.relevance_score = relevance_score
+        if feature_vector is None:
+            self.feature_vector = []
+        else:
+            self.feature_vector = feature_vector
+        self.description = description
+
+    def __str__(self):
+        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
+                               " ".join(str(f) for f in self.feature_vector))
+        return string
+
+    # @classmethod
+    def _parse_(self, text):
+        """
+    parse line into Query
+    """
+        comment_position = text.find('#')
+        line = text[:comment_position].strip()
+        self.description = text[comment_position + 1:].strip()
+        parts = line.split()
+        if len(parts) != 48:
+            sys.stdout.write("expect 48 space split parts, get %d" %
+                             (len(parts)))
+            return None
+        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+        self.relevance_score = int(parts[0])
+        self.query_id = int(parts[1].split(':')[1])
+        for p in parts[2:]:
+            pair = p.split(':')
+            self.feature_vector.append(float(pair[1]))
+        return self
+
+
+class QueryList(object):
+    """
+  group query into list, every item in list is a Query
+  """
+
+    def __init__(self, querylist=None):
+        self.query_id = -1
+        if querylist is None:
+            self.querylist = []
+        else:
+            self.querylist = querylist
+            for query in self.querylist:
+                if self.query_id == -1:
+                    self.query_id = query.query_id
+                else:
+                    if self.query_id != query.query_id:
+                        raise ValueError("query in list must be same query_id")
+
+    def __iter__(self):
+        for query in self.querylist:
+            yield query
+
+    def __len__(self):
+        return len(self.querylist)
+
+    def __getitem__(self, i):
+        return self.querylist[i]
+
+    def _correct_ranking_(self):
+        if self.querylist is None:
+            return
+        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
+
+    def _add_query(self, query):
+        if self.query_id == -1:
+            self.query_id = query.query_id
+        else:
+            if self.query_id != query.query_id:
+                raise ValueError("query in list must be same query_id")
+        self.querylist.append(query)
+
+
+def gen_plain_txt(querylist):
+    """
+  gen plain text in list for other usage
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  query_id : np.array, shape=(samples_num, )
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield querylist.query_id, query.relevance_score, np.array(
+            query.feature_vector)
+
+
+def gen_point(querylist):
+    """
+  gen item in list for point-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield query.relevance_score, np.array(query.feature_vector)
+
+
+def gen_pair(querylist, partial_order="full"):
+    """
+  gen pair for pair-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+  pairtial_order : "full" or "neighbour"
+    there is redudant in all possiable pair combinations, which can be simplifed
+  gen pairs for neighbour items or the full partial order pairs
+
+  return :
+  ------
+  label : np.array, shape=(1)
+  query_left : np.array, shape=(1, feature_dimension)
+  query_right : same as left
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    labels = []
+    docpairs = []
+
+    # C(n,2)
+    for i in range(len(querylist)):
+        query_left = querylist[i]
+        for j in range(i + 1, len(querylist)):
+            query_right = querylist[j]
+            if query_left.relevance_score > query_right.relevance_score:
+                labels.append(1)
+                docpairs.append([
+                    np.array(query_left.feature_vector),
+                    np.array(query_right.feature_vector)
+                ])
+            elif query_left.relevance_score < query_right.relevance_score:
+                labels.append(1)
+                docpairs.append([
+                    np.array(query_right.feature_vector),
+                    np.array(query_left.feature_vector)
+                ])
+    for label, pair in zip(labels, docpairs):
+        yield label, pair[0], pair[1]
+
+
+def gen_list(querylist):
+    """
+  gen item in list for list-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    relevance_score_list = [query.relevance_score for query in querylist]
+    feature_vector_list = [query.feature_vector for query in querylist]
+    yield np.array(relevance_score_list).T, np.array(feature_vector_list)
+
+
+def query_filter(querylists):
+    """
+    filter query get only document with label 0.
+    label 0, 1, 2 means the relevance score document with query
+    parameters :
+      querylist : QueyList list
+
+    return :
+      querylist : QueyList list
+    """
+    filter_query = []
+    for querylist in querylists:
+        relevance_score_list = [query.relevance_score for query in querylist]
+        if sum(relevance_score_list) != .0:
+            filter_query.append(querylist)
+    return filter_query
+
+
+def load_from_text(filepath, shuffle=True, fill_missing=-1):
+    """
+  parse data file into querys
+  """
+    prev_query_id = -1
+    querylists = []
+    querylist = None
+    fn = __initialize_meta_info__()
+    with open(os.path.join(fn, filepath)) as f:
+        for line in f:
+            query = Query()
+            query = query._parse_(line)
+            if query == None:
+                continue
+            if query.query_id != prev_query_id:
+                if querylist is not None:
+                    querylists.append(querylist)
+                querylist = QueryList()
+                prev_query_id = query.query_id
+            querylist._add_query(query)
+    if querylist is not None:
+        querylists.append(querylist)
+    if shuffle == True:
+        random.shuffle(querylists)
+    return querylists
+
+
+def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
+    """
+  Parameters
+  --------
+  filename : string
+  shuffle : shuffle query-doc pair under the same query
+  fill_missing : fill the missing value. default in MQ2007 is -1
+  
+  Returns
+  ------
+  yield
+    label query_left, query_right  # format = "pairwise"
+    label querylist # format = "listwise"
+  """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    for querylist in querylists:
+        if format == "plain_txt":
+            yield next(gen_plain_txt(querylist))
+        elif format == "pointwise":
+            yield next(gen_point(querylist))
+        elif format == "pairwise":
+            for pair in gen_pair(querylist):
+                yield pair
+        elif format == "listwise":
+            yield next(gen_list(querylist))
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
--- a/python/paddle/v2/dataset/tests/mq2007_test.py
+++ b/python/paddle/v2/dataset/tests/mq2007_test.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.v2.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape(), (46, ))
+            self.assertEqual(query_right.shape(), (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.v2.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -177,7 +177,7 @@ class SGD(object):
        Testing method. Will test input data.

        :param reader: A reader that reads and yeilds data items.
-        :type reader: collections.Iterable  
+        :type reader: collections.Iterable
        :param feeding: Feeding is a map of neural network input name and array
                        index that reader returns.
        :type feeding: dict