diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 22a26d7c5b04ba1f45de5ec9f3387c539ade730b..3ca735189da70ca826099843acf4528ee271e02f 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -57,8 +57,11 @@ function(cc_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_binary_SRCS}) - add_dependencies(${TARGET_NAME} ${cc_binary_DEPS} ${external_project_dependencies}) - target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) + link_paddle_exe(${TARGET_NAME}) + if(cc_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + endif() endfunction(cc_binary) # The dependency to target tensor implies that if any of @@ -74,8 +77,11 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} ${external_project_dependencies}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + link_paddle_test(${TARGET_NAME}) + if(cc_test_DEPS) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS}) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS}) + endif() add_test(${TARGET_NAME} ${TARGET_NAME}) endfunction(cc_test) @@ -106,8 +112,11 @@ function(nv_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) - add_dependencies(${TARGET_NAME} ${nv_binary_DEPS} ${external_project_dependencies}) - target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) + link_paddle_exe(${TARGET_NAME}) + if(nv_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + endif() endfunction(nv_binary) # The dependency to target tensor implies that if any of @@ -123,7 +132,10 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} ${external_project_dependencies}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES}) + link_paddle_test(${TARGET_NAME}) + if(nv_test_DEPS) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS}) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS}) + endif() add_test(${TARGET_NAME} ${TARGET_NAME}) endfunction(nv_test) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 154cfe24432f3e43ed724a45273b4a582b45f73d..1efa74ecda4170332d96603ca2253c68468474f9 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -207,6 +207,11 @@ trans_full_matrix_projection Aggregate Layers ================ +AggregateLevel +-------------- +.. autoclass:: paddle.v2.layer.AggregateLevel + :noindex: + .. _api_v2.layer_pooling: pooling @@ -248,6 +253,11 @@ block_expand .. _api_v2.layer_expand: +ExpandLevel +----------- +.. autoclass:: paddle.v2.layer.ExpandLevel + :noindex: + expand ------ .. 
autoclass:: paddle.v2.layer.expand diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md new file mode 100644 index 0000000000000000000000000000000000000000..310739f37ae48934afe1d042e87efef85b98f1fc --- /dev/null +++ b/doc/design/build_system/README.md @@ -0,0 +1,107 @@ +A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++. + +Here are some initial thoughts. Your comments are welcome! + +### Required CMake Functions + +I think we need only the following few CMake functions to make a project description lean and clean: + +| C++ | CUDA C++ | Go | |---|---|---| | cc_library | nv_library | go_library | | cc_binary | nv_binary | go_binary | | cc_test | nv_test | go_test | + +- The `_library` functions generate .a files from source code. +- The `_binary` functions generate executable binary files. +- The `_test` functions generate executable unit test files. They work like `_binary` but link `-lgtest` and `-lgtest_main`. + +The difference between the `nv_` functions and the `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler. + +Both `nv_` and `cc_` functions enable C++11 (-std=c++11). + +Also, + +- to describe external dependencies, we need `external_library`. +- to build shared libraries, we need `shared_library`. + +### An Example Project + +Suppose that we have the aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: + +- tensor.h - tensor.cc - tensor_test.cc - ops.h - ops.cu - ops_test.cu - api.go - api_test.go + +Suppose that ops.cu depends on CUDNN. + +```cmake +# cc_library parses tensor.cc and figures out that the target also depends +# on tensor.h. +cc_library(tensor + SRCS + tensor.cc) + +# The dependency on target tensor implies that if any of +# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built. +cc_test(tensor_test + SRCS + tensor_test.cc + DEPS + tensor) + +# I don't have a clear idea what parameters external_library needs to +# have. @gangliao as a CMake expert would have better ideas. +external_library(cudnn + ....) + +# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu +# includes global functions that take Tensor as their parameters, so +# ops depends on tensor. This implies that if any of tensor.{h,cc}, +# ops.{h,cu} is changed, ops needs to be re-built. +nv_library(ops + SRCS + ops.cu + DEPS + tensor + cudnn) # cudnn is defined later. + +nv_test(ops_test + SRCS + ops_test.cu + DEPS + ops) + +# Because api.go defines a Go wrapper to ops and tensor, it depends on +# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or +# api.go is changed, api needs to be re-built. +go_library(api + SRCS + api.go + DEPS + tensor # Because ops depends on tensor, this line is optional. + ops) + +go_test(api_test + SRCS + api_test.go + DEPS + api) + + +# This builds libapi.so. shared_library might use CMake target +# api_shared so as to distinguish it from the above target api. +shared_library(api + DEPS + api) + +``` + +### Implementation + +As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
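+
+Below is a rough sketch of what `cc_library` might look like under this scheme. It mirrors the `cc_binary`/`cc_test` helpers already present in `cmake/generic.cmake`; the details are illustrative, not a final implementation:
+
+```cmake
+# Sketch only: register a static library target and wire up its
+# dependencies, following the cc_binary/cc_test pattern in generic.cmake.
+function(cc_library TARGET_NAME)
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cc_library "" "" "${multiValueArgs}" ${ARGN})
+  add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+  if(cc_library_DEPS)
+    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+  endif()
+endfunction(cc_library)
+```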
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index c9a285c90b0674e175c592c40fa26a2222ed0f51..e147659566dba6cfbfd677e3b616bdaa4a73485c 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -26,10 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wall") -IF(WITH_COVERAGE) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") -ENDIF(WITH_COVERAGE) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/go/cclient/CMakeLists.txt b/paddle/go/cclient/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..29a2089fb10baf24ca4ab77675987c41cbea1c37 --- /dev/null +++ b/paddle/go/cclient/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.0) + +if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES) + message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})") +else() + # find #include + get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + include_directories(${PARENT_DIR}) + + # find cmake directory modules + get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY) + get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY) + + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake") + + # enable c++11 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + + # enable gtest + set(THIRD_PARTY_PATH ./third_party) + set(WITH_TESTING ON) + include(external/gtest) +endif() + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +project(cxx_go CXX C Go) + +include(cmake/golang.cmake) +include(cmake/flags.cmake) + +ExternalGoProject_Add(pserver github.com/PaddlePaddle/Paddle/paddle/go/pserver) +add_go_library(client STATIC pserver) +add_subdirectory(test) diff --git a/paddle/go/cclient/cclient.go b/paddle/go/cclient/cclient.go new file mode 100644 index 0000000000000000000000000000000000000000..dc86d47e8d0a97e3d78b174f84add8b9a3730f1f --- /dev/null +++ b/paddle/go/cclient/cclient.go @@ -0,0 +1,239 @@ +package main + +/* +#include <stdlib.h> +#include <string.h> +typedef enum { + PADDLE_ELEMENT_TYPE_INT32 = 0, + PADDLE_ELEMENT_TYPE_UINT32 = 1, + PADDLE_ELEMENT_TYPE_INT64 = 2, + PADDLE_ELEMENT_TYPE_UINT64 = 3, + PADDLE_ELEMENT_TYPE_FLOAT32 = 4, + PADDLE_ELEMENT_TYPE_FLOAT64 = 5, +} paddle_element_type; + +typedef struct { + char* name; + paddle_element_type element_type; + unsigned char* content; + int content_len; +} paddle_parameter, paddle_gradient; + +static inline void paddle_release_param(paddle_parameter* param) { + if (param != NULL) { + if (param->name != NULL) { + free(param->name); + } + + if (param->content != NULL) { + free(param->content); + } + + free(param); + } +} + +typedef int client; +*/ +import "C" + +import ( + "log" + "sync" + "unsafe" + + "github.com/PaddlePaddle/Paddle/paddle/go/pserver" +) + +var nullPtr = unsafe.Pointer(uintptr(0)) +var mu sync.Mutex +var handleMap = make(map[C.client]*pserver.Client) +var curHandle C.client +
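+// cgo code cannot hand a Go pointer to C and let C keep it, so each
+// *pserver.Client lives in handleMap and C callers only ever see an
+// integer handle (C.client). add/get/remove below translate between
+// handles and clients; mu guards the map and the handle counter.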
+func add(c *pserver.Client) C.client { + mu.Lock() + defer mu.Unlock() + client := curHandle + curHandle++ + handleMap[client] = c + return client +} + +func get(client C.client) *pserver.Client { + mu.Lock() + defer mu.Unlock() + return handleMap[client] +} + +func remove(client C.client) *pserver.Client { + mu.Lock() + defer mu.Unlock() + h := handleMap[client] + delete(handleMap, client) + return h +} + +func cArrayToSlice(p unsafe.Pointer, len int) []byte { + if p == nullPtr { + return nil + } + + // create a Go slice backed by a C array, + // reference: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + return (*[1 << 30]byte)(p)[:len:len] +} + +//export paddle_new_pserver_client +func paddle_new_pserver_client(addr *C.char) C.client { + c := pserver.NewClient(C.GoString(addr)) + return add(c) +} + +//export paddle_pserver_client_release +func paddle_pserver_client_release(client C.client) { + c := remove(client) + c.Cleanup() +} + +//export paddle_begin_init_params +func paddle_begin_init_params(client C.client, pserver_config unsafe.Pointer, config_len C.int) C.int { + c := get(client) + b := cArrayToSlice(pserver_config, int(config_len)) + selected, err := c.BeginInitParams(b) + if err != nil { + log.Println(err) + return -1 + } + + if selected { + return 1 + } + return 0 +} + +//export paddle_init_param +func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int { + et := pserver.ElementType(param.element_type) + name := C.GoString(param.name) + content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len)) + pc := pserver.ParameterWithConfig{ + Param: pserver.Parameter{Name: name, ElementType: et, Content: content}, + Config: cArrayToSlice(param_config, int(config_len)), + } + c := get(client) + err := c.InitParam(pc) + if err != nil { + log.Println(err) + return -1 + } + + return 0 +} + +//export paddle_finish_init_params +func paddle_finish_init_params(client C.client) C.int { + c := get(client) + err := c.FinishInitParams() + if err != nil { + log.Println(err) + return -1 + } + + return 0 +} + +//export paddle_send_grads +func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int { + var gs []pserver.Gradient + for i := 0; i < int(total); i++ { + grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads)))) + et := pserver.ElementType(grad.element_type) + name := C.GoString(grad.name) + content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len)) + gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content}) + } + + c := get(client) + err := c.SendGrads(gs) + if err != nil { + log.Println(err) + return -1 + } + + return 0 +} + +//export paddle_get_params +func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int { + var ns []string + for i := 0; i < int(total); i++ { + name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names)))) + ns = append(ns, C.GoString(name)) + } + c := get(client) + ps, err := c.GetParams(ns) + if err != nil { + log.Println(err) + return -1 + } + + for i := 0; i < int(total); i++ { + if i >= len(ps) { + break + } + + p := ps[i] + param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) + nameReady := false + contentAllocated := false + + if unsafe.Pointer(param) == nullPtr { + param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param)))) + } else { + if unsafe.Pointer(param.name) != nullPtr { + if 
n := C.GoString(param.name); n != p.Name { + log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name) + C.free(unsafe.Pointer(param.name)) + } else { + nameReady = true + } + } + + if unsafe.Pointer(param.content) != nullPtr { + if int(param.content_len) == len(p.Content) { + contentAllocated = true + } else { + log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content)) + C.free(unsafe.Pointer(param.content)) + } + } + } + + if !nameReady { + param.name = C.CString(p.Name) + } + if !contentAllocated { + param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content)))) + } + C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content))) + param.content_len = C.int(len(p.Content)) + param.element_type = C.paddle_element_type(p.ElementType) + } + + return 0 +} + +//export paddle_save_model +func paddle_save_model(client C.client, path *C.char) C.int { + p := C.GoString(path) + c := get(client) + err := c.SaveModel(p) + if err != nil { + log.Println(err) + return -1 + } + + return 0 +} + +func main() {} // Required but ignored diff --git a/paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake b/paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b3f8fbe271d80aaa72d90d167a0d8130bec7f362 --- /dev/null +++ b/paddle/go/cclient/cmake/CMakeDetermineGoCompiler.cmake @@ -0,0 +1,44 @@ +if(NOT CMAKE_Go_COMPILER) + if(NOT $ENV{GO_COMPILER} STREQUAL "") + get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT) + + if(CMAKE_Go_FLAGS_ENV_INIT) + set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler") + endif() + + if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT}) + message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.") + endif() + + endif() + + set(Go_BIN_PATH + $ENV{GOPATH} + $ENV{GOROOT} + $ENV{GOROOT}/../bin + $ENV{GO_COMPILER} + /usr/bin + /usr/local/bin + ) + + if(CMAKE_Go_COMPILER_INIT) + set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler") + else() + find_program(CMAKE_Go_COMPILER + NAMES go + PATHS ${Go_BIN_PATH} + ) + EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION) + STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}") + message("-- The Golang compiler identification is ${VERSION}") + message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}") + endif() + +endif() + +mark_as_advanced(CMAKE_Go_COMPILER) + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/CMakeGoCompiler.cmake.in + ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY) + +set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git a/paddle/go/cclient/cmake/CMakeGoCompiler.cmake.in b/paddle/go/cclient/cmake/CMakeGoCompiler.cmake.in new file mode 100644 index 0000000000000000000000000000000000000000..a71f08e064656fbaad8cfa77aea6f216515712ef --- /dev/null +++ b/paddle/go/cclient/cmake/CMakeGoCompiler.cmake.in @@ -0,0 +1,8 @@ +set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@") +set(CMAKE_Go_COMPILER_LOADED 1) + +set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go) +set(CMAKE_Go_LINKER_PREFERENCE 40) +set(CMAKE_Go_OUTPUT_EXTENSION .o) +set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1) +set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git 
a/paddle/go/cclient/cmake/CMakeGoInformation.cmake b/paddle/go/cclient/cmake/CMakeGoInformation.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ba51ac93fcd429478f324b66bd5129d94ea2a8f4 --- /dev/null +++ b/paddle/go/cclient/cmake/CMakeGoInformation.cmake @@ -0,0 +1,7 @@ +if(NOT CMAKE_Go_COMPILE_OBJECT) + set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o <OBJECT> <SOURCE>") +endif() + +if(NOT CMAKE_Go_LINK_EXECUTABLE) + set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o <TARGET> <OBJECTS>") +endif() diff --git a/paddle/go/cclient/cmake/CMakeTestGoCompiler.cmake b/paddle/go/cclient/cmake/CMakeTestGoCompiler.cmake new file mode 100644 index 0000000000000000000000000000000000000000..b9891b015baced05b51e34dba562fd98a84fe14c --- /dev/null +++ b/paddle/go/cclient/cmake/CMakeTestGoCompiler.cmake @@ -0,0 +1 @@ +set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "") diff --git a/paddle/go/cclient/cmake/flags.cmake b/paddle/go/cclient/cmake/flags.cmake new file mode 100644 index 0000000000000000000000000000000000000000..062d5ab660dad2327d9f514f22c2868cc0f161a7 --- /dev/null +++ b/paddle/go/cclient/cmake/flags.cmake @@ -0,0 +1,45 @@ +# Setting Paddle Compile Flags +include(CheckCXXCompilerFlag) +include(CheckCCompilerFlag) +include(CheckCXXSymbolExists) +include(CheckTypeSize) + +function(CheckCompilerCXX11Flag) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" + # Apple Clang is a different compiler than upstream Clang, which has different version numbers. + # https://gist.github.com/yamaya/2924292 + if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) + message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") + endif() + else() + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) + message(FATAL_ERROR "Unsupported Clang version. 
Clang >= 3.3 required.") + endif() + endif() + endif() +endfunction() + +CheckCompilerCXX11Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +# Common gpu architectures: Kepler, Maxwell +foreach(capability 30 35 50) + list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") +endforeach() + +if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") + list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") +endif() + +# Modern gpu architectures: Pascal +if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") + list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") +endif() + +set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) \ No newline at end of file diff --git a/paddle/go/cclient/cmake/golang.cmake b/paddle/go/cclient/cmake/golang.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5d39868bfdfbfbeb88861c7829b6485589993052 --- /dev/null +++ b/paddle/go/cclient/cmake/golang.cmake @@ -0,0 +1,46 @@ +set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") +file(MAKE_DIRECTORY ${GOPATH}) + +function(ExternalGoProject_Add TARG) + add_custom_target(${TARG} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN}) +endfunction(ExternalGoProject_Add) + +function(add_go_executable NAME) + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + -o "${CMAKE_CURRENT_BINARY_DIR}/${NAME}" + ${CMAKE_GO_FLAGS} ${GO_SOURCE} + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + + add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN}) + install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${NAME} DESTINATION bin) +endfunction(add_go_executable) + + +function(ADD_GO_LIBRARY NAME BUILD_TYPE) + if(BUILD_TYPE STREQUAL "STATIC") + set(BUILD_MODE -buildmode=c-archive) + set(LIB_NAME "lib${NAME}.a") + else() + set(BUILD_MODE -buildmode=c-shared) + if(APPLE) + set(LIB_NAME "lib${NAME}.dylib") + else() + set(LIB_NAME "lib${NAME}.so") + endif() + endif() + + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" + ${CMAKE_GO_FLAGS} ${GO_SOURCE} + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}) + + add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN}) + + if(NOT BUILD_TYPE STREQUAL "STATIC") + install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin) + endif() +endfunction(ADD_GO_LIBRARY) diff --git a/paddle/go/cclient/test/CMakeLists.txt b/paddle/go/cclient/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c899bd275d37bc28d4aefb2476310581a57e5bb4 --- /dev/null +++ b/paddle/go/cclient/test/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.0) + +include_directories(/env/gopath/src/github.com/PaddlePaddle/Paddle/paddle/go/cclient/build/) + +add_executable(main main.c) +add_dependencies(main client) +set (CMAKE_EXE_LINKER_FLAGS "-pthread") +target_link_libraries(main /env/gopath/src/github.com/PaddlePaddle/Paddle/paddle/go/cclient/build/libclient.a) # ${GTEST_LIBRARIES}) diff --git a/paddle/go/cclient/test/main.c b/paddle/go/cclient/test/main.c new file mode 100644 index 0000000000000000000000000000000000000000..28e3d03b7a000d3251a8d525ce50ca664eff3424 --- /dev/null +++ b/paddle/go/cclient/test/main.c @@ -0,0 +1,69 @@ 
+#include "libclient.h" + +//#include "gtest/gtest.h" + +void panic() { + // TODO(helin): fix: gtest using cmake is not working, using this + // hacky way for now. + *(void*)0; +} + +int main() { + char addr[] = "localhost:3000"; + client c = paddle_new_pserver_client(addr); +retry: + if (paddle_begin_init_params(c, NULL, 0)) { + paddle_parameter param; + char name_a[] = "param_a"; + char name_b[] = "param_b"; + char content[] = {0x00, 0x11, 0x22}; + param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + param.name = name_a; + param.content = content; + param.content_len = 3; + if (paddle_init_param(c, param, NULL, 0) != 0) { + goto retry; + } + param.element_type = PADDLE_ELEMENT_TYPE_INT32; + param.name = name_b; + param.content = content; + param.content_len = 3; + if (paddle_init_param(c, param, NULL, 0) != 0) { + goto retry; + } + if (paddle_finish_init_params(c) != 0) { + goto retry; + } + } else { + panic(); + } + + char content[] = {0x00, 0x11, 0x22}; + paddle_gradient grads[2] = { + {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3}, + {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}}; + + if (!paddle_send_grads(c, grads, 2)) { + panic(); + } + + paddle_parameter* params[2] = {NULL, NULL}; + char* names[] = {"param_a", "param_b"}; + if (!paddle_get_params(c, names, params, 2)) { + panic(); + } + + // get parameters again by reusing the allocated parameter buffers. + if (!paddle_get_params(c, names, params, 2)) { + panic(); + } + + paddle_release_param(params[0]); + paddle_release_param(params[1]); + + if (!paddle_save_model(c, "/tmp/")) { + panic(); + } + + return 0; +} diff --git a/paddle/go/pserver/client.go b/paddle/go/pserver/client.go new file mode 100644 index 0000000000000000000000000000000000000000..5b110af648da684c945ba6bfda3d50fa9a295773 --- /dev/null +++ b/paddle/go/pserver/client.go @@ -0,0 +1,83 @@ +package pserver + +// ElementType is the type of elements of a Parameter. +type ElementType int + +// Supported element types +const ( + Int32 ElementType = iota + UInt32 + Int64 + UInt64 + Float32 + Float64 +) + +// Parameter is a piece of data to sync with the parameter server. +type Parameter struct { + Name string + ElementType ElementType + Content []byte +} + +// ParameterWithConfig contains the parameter and the configuration. +type ParameterWithConfig struct { + Param Parameter + Config []byte // parameter configuration in Proto Buffer format +} + +// Gradient is the gradient of the parameter. +type Gradient Parameter + +// Client is the client to parameter servers. +type Client struct { +} + +// NewClient creates a new client. +func NewClient(addr string) *Client { + return &Client{} +} + +// BeginInitParams begins to initialize parameters on parameter +// servers. +// +// BeginInitParams will be called from multiple trainers, only one +// trainer will be selected to initialize the parameters on parameter +// servers. Other trainers will be blocked until the initialization is +// done, and they need to get the initialized parameters from +// parameter servers using GetParams. +func (c *Client) BeginInitParams(pserverConfigProto []byte) (selected bool, err error) { + return true, nil +} + +// InitParam initializes the parameter on parameter servers. +func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error { + return nil +} + +// FinishInitParams tells parameter servers client has sent all +// parameters to parameter servers as initialization. 
+// SendGrads sends gradients to parameter servers for updating +// parameters. +func (c *Client) SendGrads(grads []Gradient) error { + return nil +} + +// GetParams gets parameters from parameter servers. +func (c *Client) GetParams(names []string) ([]Parameter, error) { + return nil, nil +} + +// SaveModel tells parameter servers to save the current parameters to +// the given path. +func (c *Client) SaveModel(path string) error { + return nil +} + +// Cleanup cleans up the client states. +func (c *Client) Cleanup() { +} diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt index d4bce38906e9326992f6a44ac5cf25309063806a..cb8bece00e1c77a8ffacf4cf8cc7a73fd508fbd3 100644 --- a/paddle/majel/CMakeLists.txt +++ b/paddle/majel/CMakeLists.txt @@ -1,4 +1,5 @@ -cc_library(majel SRCS place.cc) +cc_library(place SRCS place.cc) +cc_library(ddim SRCS ddim.cc) if(WITH_TESTING) add_subdirectory(test) diff --git a/paddle/majel/README.md b/paddle/majel/README.md index 2573738b66b2bf514d06358262ef941e833daf0f..7a80816d8e4ffa3a9462f3d9b87eff0f048466aa 100644 --- a/paddle/majel/README.md +++ b/paddle/majel/README.md @@ -106,34 +106,84 @@ Because `variant` may be thought of as "multi-type, single value", we can utilize arr[make_ddim({0, 1})] = 1.0; ``` -## implement Tensor in Paddle +## Implement Tensor in Paddle + +We want to create a Tensor class to replace Vector and Matrix, and to support high-dimensional data. The operations on Tensor are implemented on both CPU and GPU. We also want to make sure that the Tensor interface is friendly to its callers. + +Tensor is only responsible for describing computing. It will not take charge of memory allocation policy, handles of some CUDA library contexts (e.g. cublasHandle, cudnnHandle), or dispatching CUDA kernels. Paddle already handles hardware initialization and resource management. Before writing code, please make sure you have looked through the Majel source code and grasped the design philosophy of `DArray` in Majel. -To assign subtasks to our colleagues, we have to discuss how to divide it to independent subtasks. -- [ ] 1. First, we need to consider the third-party dependencies in Majel. +### Memory Management +`Allocation` manages a block of memory on a device (CPU/GPU). We use `Place` to describe the memory location. The details of memory allocation and deallocation are implemented in `Allocator` and `DeAllocator`. Related low-level APIs such as `hl_malloc_device()` and `hl_malloc_host()` are provided by Paddle. + +### Dim and Array +#### Dim + +`Dim` describes the dimension information of an array. - Majel heavily use `boost.variant`, but we don't want to integrate `boost` into PaddlePaddle. It's better to replace boost using the lightweight implementation. https://github.com/mapbox/variant Mapbox variant has the same speedy performance of `boost::variant `but is faster to compile, results in smaller binaries, and has no dependencies. +`DDimVar` is an alias of a specialized class of the boost.variant class template. -> @gangliao +`DDim` is introduced to represent a dynamically sized dimension. + +For example: + +``` +Dim<2> d1 = make_dim(3, 3); +DDim d2 = make_ddim({1, 2, 3}); +``` -- [ ] 2. Re-implement `Place` and `Allocation/Memory` +You must specify a concretely sized dimension for Dim, whereas DDim can represent a dynamically sized dimension. +#### Array + +`Array` represents a tensor with a specific type and size. + +`DArrayVar` is an alias of a specialized class of the boost.variant class template.
+ +`DArray` is introduced to represent a dynamically typed array. + +For example: + +``` +Array<float, 2> a1(Dim<2>(2, 2)); +DArray a2 = make_darray(make_ddim({3, 4}), 0.0, CpuPlace()); +``` - I found @wangkuiyi submitted a pull request includes `Place`. @gangliao and @qijun could re-implement `Allocation`, because we have the GPU development experience before joining Paddle team. +You must specify the type and dimension of an Array, whereas DArray can represent a dynamically typed array. -> @wangkuiyi @gangliao @qijun -- [ ] 3. Re-implement `Dim`. +Please refer to the section `Learn from Majel` for more details. - `Dim` is an excellent implementation in Majel. ### ArrayView -> ??? +`ViewIterator` is a class template which implements basic iterator operations, including increment(++), decrement(--), dereference(*), equality comparisons(==) and so on. -- [ ] 4. Re-implement `Array/Tensor`. +`ArrayView` is an encapsulation of `Array`, which introduces extra iterator methods, such as `begin()` and `end()`. The `begin()` method returns an iterator pointing to the first element in the ArrayView, and the `end()` method returns an iterator pointing to the past-the-end element in the ArrayView. + +`ArrayView` makes visiting and manipulating an array more efficient, flexible, and safe. + + +A global function `make_view` is provided to transform an array into the corresponding ArrayView. + +``` +template <typename T, int D> +ArrayView<T, D> make_view(const Array<T, D>& in) { + return in; +} +``` + +A global function `make_iterator` is provided to make an iterator of an array. + +``` +template <typename T, int D> +ViewIterator<ArrayView<T, D>> make_iterator(const Array<T, D>& in, Dim<D> idx) { + return make_iterator(make_view(in), idx); +} +``` -> Prerequisites: 1 - 3 ### Basic Operations -- [ ] 5. Re-implement fundamental operators for `Array/Tensor`. +The operations that manipulate DArray are defined as global functions, such as `ones`, `zeros`, `reshape`, `gemm` and so on. -> Prerequisites: 1 - 4 +An array will be transformed into an ArrayView and then passed to the operation, which is launched on a specific device (CPU/GPU). 
diff --git a/paddle/majel/ddim.cc b/paddle/majel/ddim.cc new file mode 100644 index 0000000000000000000000000000000000000000..f32408ed53074234873ec0ea8ee7f4e449e5e908 --- /dev/null +++ b/paddle/majel/ddim.cc @@ -0,0 +1,222 @@ +#include "paddle/majel/ddim.h" + +namespace majel { + +///@cond HIDDEN + +template <int i> +Dim<i> make_dim(const int* d) { + return Dim<i>(*d, make_dim<i - 1>(d + 1)); +} + +template <> +Dim<1> make_dim<1>(const int* d) { + return Dim<1>(*d); +} + +void make_ddim(DDim& ddim, const int* dims, int n) { + switch (n) { + case 1: + ddim = make_dim<1>(dims); + break; + case 2: + ddim = make_dim<2>(dims); + break; + case 3: + ddim = make_dim<3>(dims); + break; + case 4: + ddim = make_dim<4>(dims); + break; + case 5: + ddim = make_dim<5>(dims); + break; + case 6: + ddim = make_dim<6>(dims); + break; + case 7: + ddim = make_dim<7>(dims); + break; + case 8: + ddim = make_dim<8>(dims); + break; + case 9: + ddim = make_dim<9>(dims); + break; + default: + throw std::invalid_argument( + "Dynamic dimensions must have between [1, 9] dimensions."); + } +} + +///@endcond + +DDim make_ddim(std::initializer_list<int> dims) { + DDim result(make_dim(0)); + make_ddim(result, dims.begin(), dims.size()); + return result; +} + +DDim make_ddim(const std::vector<int>& dims) { + DDim result(make_dim(0)); + make_ddim(result, &dims[0], dims.size()); + return result; +} + +///@cond HIDDEN +// XXX For some reason, putting this in an anonymous namespace causes errors +class DynamicMutableIndexer : public boost::static_visitor<int&> { +public: + DynamicMutableIndexer(int idx) : idx_(idx) {} + + template <int D> + int& operator()(Dim<D>& dim) const { + return dim[idx_]; + } + +private: + int idx_; +}; + +class DynamicConstIndexer : public boost::static_visitor<int> { +public: + DynamicConstIndexer(int idx) : idx_(idx) {} + + template <int D> + int operator()(const Dim<D>& dim) const { + return dim[idx_]; + } + +private: + int idx_; +}; + +///@endcond + +int& DDim::operator[](int idx) { + return boost::apply_visitor(DynamicMutableIndexer(idx), var); +} + +int DDim::operator[](int idx) const { + return boost::apply_visitor(DynamicConstIndexer(idx), var); +} + +bool DDim::operator==(DDim d) const { + if (var.which() != d.getVar().which()) { + return false; + } else { + std::vector<int> v1 = vectorize(*this); + std::vector<int> v2 = vectorize(d); + + for (unsigned int i = 0; i < v1.size(); i++) { + if (v1[i] != v2[i]) { + return false; + } + } + + return true; + } +} + +bool DDim::operator!=(DDim d) const { return !(*this == d); } + +DDim DDim::operator+(DDim d) const { + std::vector<int> v1 = vectorize(*this); + std::vector<int> v2 = vectorize(d); + + std::vector<int> v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] + v2[i]); + } + + return make_ddim(v3); +} + +DDim DDim::operator*(DDim d) const { + std::vector<int> v1 = vectorize(*this); + std::vector<int> v2 = vectorize(d); + + std::vector<int> v3; + + assert(v1.size() == v2.size()); + + for (unsigned int i = 0; i < v1.size(); i++) { + v3.push_back(v1[i] * v2[i]); + } + + return make_ddim(v3); +} + +int get(const DDim& ddim, int idx) { return ddim[idx]; } + +void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } + +///@cond HIDDEN +struct VectorizeVisitor : public boost::static_visitor<> { + std::vector<int>& vector; + + VectorizeVisitor(std::vector<int>& v) : vector(v) {} + + template <typename T> + void operator()(const T& t) { + vector.push_back(t.head); + this->operator()(t.tail); + } + + void operator()(const Dim<1>& t) { vector.push_back(t.head); } +}; +///@endcond +
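+// Example (cf. ddim_test.cc): vectorize(make_ddim({9, 1, 5})) returns
+// std::vector<int>{9, 1, 5}.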
+std::vector<int> vectorize(const DDim& ddim) { + std::vector<int> result; + VectorizeVisitor visitor(result); + boost::apply_visitor(visitor, ddim); + return result; +} + +ssize_t product(const DDim& ddim) { + ssize_t result = 1; + std::vector<int> v = vectorize(ddim); + for (auto i : v) { + result *= i; + } + return result; +} + +///\cond HIDDEN + +struct ArityVisitor : boost::static_visitor<int> { + template <int D> + int operator()(Dim<D>) const { + return D; + } +}; + +///\endcond + +int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } + +///\cond HIDDEN + +struct DDimPrinter : boost::static_visitor<void> { + std::ostream& os; + DDimPrinter(std::ostream& os_) : os(os_) {} + + template <typename T> + void operator()(const T& t) { + os << t; + } +}; + +///\endcond + +std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) { + DDimPrinter printer(os); + boost::apply_visitor(printer, ddim); + return os; +} + +} // namespace majel diff --git a/paddle/majel/ddim.h b/paddle/majel/ddim.h new file mode 100644 index 0000000000000000000000000000000000000000..7be756f8c098ba5aa3a5ff4380c90f4b90a55bb7 --- /dev/null +++ b/paddle/majel/ddim.h @@ -0,0 +1,109 @@ +#pragma once + +#include <boost/variant.hpp> +#include <initializer_list> +#include <stdexcept> +#include <vector> + +#include "paddle/majel/dim.h" + +namespace majel { + +namespace { +typedef boost::variant<Dim<1>, + Dim<2>, + Dim<3>, + Dim<4>, + Dim<5>, + Dim<6>, + Dim<7>, + Dim<8>, + Dim<9>> + DDimVar; +} + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. + */ +struct DDim { + DDimVar var; + + DDim() : var(Dim<1>()) {} + + template <int D> + DDim(const Dim<D>& in) : var(in) {} + + template <int D> + DDim& operator=(const Dim<D>& in) { + var = in; + return *this; + } + + int& operator[](int idx); + int operator[](int idx) const; + + template <typename Visitor> + typename Visitor::result_type apply_visitor(Visitor& visitor) { + return var.apply_visitor(visitor); + } + + template <typename Visitor> + typename Visitor::result_type apply_visitor(Visitor& visitor) const { + return var.apply_visitor(visitor); + } + + DDimVar getVar() { return var; } + + bool operator==(DDim d) const; + + bool operator!=(DDim d) const; + + DDim operator+(DDim d) const; + + DDim operator*(DDim d) const; +}; + +/** + * \brief Make a DDim from std::vector<int> + * + * \param dims A vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector<int>& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints. Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list<int> dims); + +int get(const DDim& dim, int idx); +void set(DDim& dim, int idx, int val); + +std::vector<int> vectorize(const DDim& ddim); + +ssize_t product(const DDim& ddim); +
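+// Example (cf. ddim_test.cc): product(make_ddim({9, 1, 5})) == 9 * 1 * 5
+// == 45.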
+/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const majel::DDim&); + +} // namespace majel + +namespace boost { + +template <typename T> +T get(const majel::DDim& in) { + return boost::get<T>(in.var); +} + +} // namespace boost diff --git a/paddle/majel/detail/cuda_assert.h b/paddle/majel/detail/cuda_assert.h new file mode 100644 index 0000000000000000000000000000000000000000..9490d0ae3eff01bdb4403de710b7bfd878e87f03 --- /dev/null +++ b/paddle/majel/detail/cuda_assert.h @@ -0,0 +1,32 @@ +#pragma once + +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + +#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG) +#include <stdio.h> +#define MAJEL_ASSERT(e) \ + do { \ + if (!(e)) { \ + printf( \ + "%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, TOSTRING(e)); \ + asm("trap;"); \ + } \ + } while (0) + +#define MAJEL_ASSERT_MSG(e, m) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s).\n", \ + __FILE__, \ + __LINE__, \ + TOSTRING(e), \ + m); \ + asm("trap;"); \ + } \ + } while (0) +#else +#include <assert.h> +#define MAJEL_ASSERT(e) assert(e) +#define MAJEL_ASSERT_MSG(e, m) assert((e) && (m)) +#endif diff --git a/paddle/majel/detail/hostdevice.h b/paddle/majel/detail/hostdevice.h new file mode 100644 index 0000000000000000000000000000000000000000..e7de86b7b2f75d206e730ec409bbee5d0a08942e --- /dev/null +++ b/paddle/majel/detail/hostdevice.h @@ -0,0 +1,9 @@ +#pragma once + +#ifdef __CUDACC__ +#define HOSTDEVICE __host__ __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define HOST +#endif diff --git a/paddle/majel/dim.h b/paddle/majel/dim.h new file mode 100644 index 0000000000000000000000000000000000000000..c4b0c6aea683384d4657dd5db6f419b9e1108704 --- /dev/null +++ b/paddle/majel/dim.h @@ -0,0 +1,451 @@ +#pragma once + +#include <iostream> +#include <sstream> +#include <stdexcept> +#include <type_traits> + +#include "paddle/majel/detail/cuda_assert.h" +#include "paddle/majel/detail/hostdevice.h" + +namespace majel { + +// Statically sized, statically indexed dimension +template <int i> +struct Dim { + static constexpr int dimensions = i; + + template <typename... Args> + HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) { + static_assert(sizeof...(_tail) == i - 1, + "Dim initialized with the wrong number of parameters"); + } + + HOSTDEVICE + Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {} + + HOSTDEVICE + Dim() : head(0), tail() {} + + /** Construct a Dim from a linear index and size. Uses Fortran order + * indexing. */ + HOSTDEVICE + Dim(int idx, const Dim<i>& size) + : head(idx % size.head), tail(idx / size.head, size.tail) {} +
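+  // Example (cf. dim_test.cu): with size == (4, 5, 2), linear index 14
+  // unpacks column-major to (14 % 4, (14 / 4) % 5, 14 / 20) == (2, 3, 0).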
+ /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE + Dim(int idx) : head(idx), tail(idx) {} + + HOSTDEVICE + bool operator==(const Dim<i>& o) const { + return (head == o.head) && (tail == o.tail); + } + + HOSTDEVICE + bool operator!=(const Dim<i>& o) const { return !(*this == o); } + + HOSTDEVICE + int& operator[](int idx); + HOSTDEVICE + int operator[](int idx) const; + + HOST std::string to_string() const; + + int head; + Dim<i - 1> tail; +}; + +// Base case specialization +template <> +struct Dim<1> { + static constexpr int dimensions = 1; + + HOSTDEVICE + Dim(int _head) : head(_head) {} + + HOSTDEVICE + Dim() : head(0) {} + + HOSTDEVICE + Dim(int idx, const Dim<1>& size) : head(idx) { +#ifndef __CUDA_ARCH__ + if (idx >= size.head) { + throw std::invalid_argument("Index out of range."); + } +#else + MAJEL_ASSERT(idx < size.head); +#endif + } + + HOSTDEVICE + bool operator==(const Dim<1>& o) const { return (head == o.head); } + + HOSTDEVICE + bool operator!=(const Dim<1>& o) const { return !(*this == o); } + + HOSTDEVICE + int& operator[](int idx); + HOSTDEVICE + int operator[](int idx) const; + + int head; +}; + +namespace { + +// Helper for accessing Dim classes +template <int i> +struct DimGetter { + // Return a copy if Dim is const + template <typename D> + HOSTDEVICE static int impl(const D& d) { + return DimGetter<i - 1>::impl(d.tail); + } + // Return a reference if Dim is mutable + template <typename D> + HOSTDEVICE static int& impl(D& d) { + return DimGetter<i - 1>::impl(d.tail); + } +}; + +// Eureka! We found the element! +template <> +struct DimGetter<0> { + // Return a copy if Dim is const + template <typename D> + HOSTDEVICE static int impl(const D& d) { + return d.head; + } + // Return a reference if Dim is mutable + template <typename D> + HOSTDEVICE static int& impl(D& d) { + return d.head; + } +}; + +template <int D> +HOSTDEVICE int& indexer(Dim<D>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + MAJEL_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + MAJEL_ASSERT(idx == 0); +#endif + return dim.head; +} + +template <int D> +HOSTDEVICE int indexer(const Dim<D>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx < 0) { + throw std::invalid_argument("Tried to access a negative dimension"); + } +#else + MAJEL_ASSERT(idx >= 0); +#endif + if (idx == 0) { + return dim.head; + } + return indexer(dim.tail, idx - 1); +} + +template <> +HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) { +#ifndef __CUDA_ARCH__ + if (idx != 0) { + throw std::invalid_argument("Invalid index"); + } +#else + MAJEL_ASSERT(idx == 0); +#endif + return dim.head; +} + +} // namespace +// Static access to constant Dim +template <int l, int i> +HOSTDEVICE int get(const Dim<i>& d) { + return DimGetter<l>::impl(d); +} + +// Static access to mutable Dim +template <int l, int i> +HOSTDEVICE int& get(Dim<i>& d) { + return DimGetter<l>::impl(d); +} + +// Dynamic access to constant Dim +template <int l> +HOSTDEVICE int Dim<l>::operator[](int i) const { + return indexer(*this, i); +} + +// Dynamic access to mutable Dim +template <int l> +HOSTDEVICE int& Dim<l>::operator[](int i) { + return indexer(*this, i); +} + +// Dynamic access to constant Dim +inline HOSTDEVICE int Dim<1>::operator[](int i) const { + return indexer(*this, i); 
+} + +// Dynamic access to mutable Dim +inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); } + +// Dynamic access to constant Dim +// without std::enable_if will try to instantiate this on get<0>(d) +template <int l> +HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d, + int i) { + return d[i]; +} + +// Dynamic access to mutable Dim +template <int l> +HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) { + return d[i]; +} + +// Dot product of two dims +template <int i> +HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) { + return a.head * b.head + linearize(a.tail, b.tail); +} + +// Base case dot product of two Dims +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) { + return a.head * b.head; +} + +// Product of a Dim +template <int i> +HOSTDEVICE int product(const Dim<i>& a, int prod = 1) { + return prod * a.head * product(a.tail); +} + +// Base case product of a Dim +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline int product(const Dim<1>& a, int prod) { + return prod * a.head; +} + +// Is 0 <= idx_i < size_i for all i? +template <int i> +HOSTDEVICE bool contained(const Dim<i>& idx, const Dim<i>& size) { + return ((0 <= idx.head) && (idx.head < size.head) && + contained(idx.tail, size.tail)); +} + +// Base case of is 0 <= idx_i < size_i ? +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) { + return ((0 <= idx.head) && (idx.head < size.head)); +} + +/** + * \brief Check if a size and a stride create a Fortran order contiguous + * block of memory. + */ +template <int i> +HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) { + if (product(size) == 0) return true; + int contiguous_stride = get<0>(size) == 1 ? 0 : mul; + return (get<0>(stride) == contiguous_stride && + contiguous(size.tail, stride.tail, mul * get<0>(size))); +} + +///\cond HIDDEN +// Base case of contiguous, check the nth stride is the size of +// the prefix multiply of n-1 dims. +template <> +inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) { + if (get<0>(size) == 0) return true; + int contiguous_stride = get<0>(size) == 1 ? 0 : mul; + return get<0>(stride) == contiguous_stride; +} +///\endcond + +/** + * \brief Compute exclusive prefix-multiply of a Dim. + */ +template <int i> +HOSTDEVICE Dim<i> ex_prefix_mul(const Dim<i>& src, int mul = 1) { + return Dim<i>(mul, ex_prefix_mul(src.tail, mul * src.head)); +} + +///\cond HIDDEN +// Base case of ex_prefix_mul +// Notice it is inline because it is no longer a template +template <> +HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) { + return Dim<1>(mul); +} +///\endcond +
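+// Example (cf. dim_test.cu): ex_prefix_mul(Dim<3>(3, 4, 5)) == (1, 3, 12);
+// entry k is the product of the extents before k, i.e. the Fortran-order
+// strides of a contiguous array.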
+/** + * \brief Calculate strides of a contiguous array of the given size + * + * Sets the stride for any dimension with an extent of 1 to 0. + * \param size Dim object containing the size of the array. + * \param base The base stride to use. + * \return Dim object the same size as \p size with the strides. + */ +template <int i> +HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) { + int stride = size.head == 1 ? 0 : base; + return Dim<i>(stride, contiguous_strides(size.tail, base * size.head)); +} + +///\cond HIDDEN + +// Base case of contiguous_strides +template <> +HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) { + int stride = size.head == 1 ? 0 : base; + return Dim<1>(stride); +} + +///\endcond +
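+// Example (cf. dim_test.cu): contiguous_strides(Dim<3>(10, 1, 10)) ==
+// (1, 0, 10); the stride of any extent-1 dimension is forced to zero.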
+/** + * Add two dimensions together + */ +template <int i> +HOSTDEVICE Dim<i> dim_plus(const Dim<i>& a, const Dim<i>& b) { + return Dim<i>(a.head + b.head, dim_plus(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_plus(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head + b.head); +} + +template <int i> +HOSTDEVICE Dim<i> operator+(const Dim<i>& lhs, const Dim<i>& rhs) { + return dim_plus(lhs, rhs); +} + +/** + * Multiply two dimensions together + */ +template <int i> +HOSTDEVICE Dim<i> dim_mult(const Dim<i>& a, const Dim<i>& b) { + return Dim<i>(a.head * b.head, dim_mult(a.tail, b.tail)); +} + +// Base case +template <> +HOSTDEVICE inline Dim<1> dim_mult(const Dim<1>& a, const Dim<1>& b) { + return Dim<1>(a.head * b.head); +} + +template <int i> +HOSTDEVICE Dim<i> operator*(const Dim<i>& lhs, const Dim<i>& rhs) { + return dim_mult(lhs, rhs); +} + +/** + * \brief Normalize strides to ensure any dimension with extent 1 + * has stride 0. + * + * \param size Dim object containing the size of an array + * \param stride Dim object containing stride of an array + * \return Dim object the same size as \p size with normalized strides + * + */ + +template <int i> +HOSTDEVICE Dim<i> normalize_strides(const Dim<i>& size, const Dim<i>& stride) { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim<i>(norm_stride, normalize_strides(size.tail, stride.tail)); +} + +///\cond HIDDEN + +template <> +HOSTDEVICE inline Dim<1> normalize_strides(const Dim<1>& size, + const Dim<1>& stride) { + int norm_stride = size.head == 1 ? 0 : stride.head; + return Dim<1>(norm_stride); +} + +///\endcond + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template <typename... Args> +HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) { + return Dim<sizeof...(Args)>(idxes...); +} + +// Allows us to output a Dim +// XXX For some reason, overloading fails to resolve this correctly +template <int i> +typename std::enable_if<(i > 1), std::ostream&>::type operator<<( + std::ostream& os, const majel::Dim<i>& d) { + os << d.head << ", " << d.tail; + return os; +} + +// Base case that allows us to output a Dim +// XXX I wish this could be an overload instead of a template +template <int i> +typename std::enable_if<(i == 1), std::ostream&>::type operator<<( + std::ostream& os, const majel::Dim<i>& d) { + os << d.head; + return os; +} + +template <int i> +HOST std::string Dim<i>::to_string() const { + std::stringstream stream; + + stream << *this; + + return stream.str(); +} + +template <int D> +HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) { + Dim<D> result; + + for (int i = 0; i < D - 1; ++i) { + result[i] = linear_index % extents[i]; + linear_index /= extents[i]; + } + + result[D - 1] = linear_index; + + return result; +} + +} // namespace majel diff --git a/paddle/majel/place.cc b/paddle/majel/place.cc index eecd8e5b730704258d2bd7d98a75a0a80e13a797..ca50b37843e0ba047f8f8b8d24a3d3c131587382 100644 --- a/paddle/majel/place.cc +++ b/paddle/majel/place.cc @@ -1,4 +1,4 @@ -#include <majel/place.h> +#include "paddle/majel/place.h" namespace majel { @@ -16,7 +16,7 @@ public: void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; } }; -} // namespace majel +} // namespace detail static Place the_default_place; diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt index 68f9059874aed8843da1fc598c7d2e57e9b8bbfe..9d632d568e9908c5a1505f23f43c8bd3daecff18 100644 --- a/paddle/majel/test/CMakeLists.txt +++ b/paddle/majel/test/CMakeLists.txt @@ -1,7 +1,12 @@ cc_test(place_test SRCS place_test.cc - DEPS majel) + DEPS place) + +cc_test(ddim_test + SRCS ddim_test.cc + DEPS ddim) if(WITH_GPU) nv_test(cuda_test SRCS cuda_test.cu) + nv_test(dim_test SRCS dim_test.cu DEPS ddim) endif() diff --git a/paddle/majel/test/ddim_test.cc b/paddle/majel/test/ddim_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5b8a7c4d26740c1c4169547e76a0cf5558facc0 --- /dev/null +++ b/paddle/majel/test/ddim_test.cc @@ -0,0 +1,65 @@ +//#include +//#include +#include <sstream> +#include <vector> + +#include "gtest/gtest.h" +#include "paddle/majel/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initializer list + majel::DDim ddim = majel::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector<int> vec({9, 1, 5}); + majel::DDim vddim = majel::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + majel::set(ddim, 0, 6); + EXPECT_EQ(majel::get(ddim, 0), 6); + + // vectorize a DDim + std::vector<int> res_vec = majel::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + majel::Dim<3> d(3, 2, 1); + res_vec = majel::vectorize(majel::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // add two DDims + majel::DDim ddim_sum = ddim + vddim; + EXPECT_EQ(ddim_sum[0], 15); + EXPECT_EQ(ddim_sum[1], 3); + EXPECT_EQ(ddim_sum[2], 10); + + // multiply two DDims + majel::DDim ddim_mul = ddim * vddim; + EXPECT_EQ(ddim_mul[0], 54); + EXPECT_EQ(ddim_mul[1], 2); + EXPECT_EQ(ddim_mul[2], 25); + + // arity of a DDim + EXPECT_EQ(majel::arity(ddim), 3); + + // product of a DDim + 
EXPECT_EQ(majel::product(vddim), 45); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + majel::DDim ddim = majel::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/majel/test/dim_test.cu b/paddle/majel/test/dim_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..a7d81e595bea7fa6326ea350e2702e1ef8f5caa4 --- /dev/null +++ b/paddle/majel/test/dim_test.cu @@ -0,0 +1,128 @@ +#include <thrust/device_vector.h> +#include <sstream> + +#include "paddle/majel/dim.h" +#include "gtest/gtest.h" + +__global__ void test(majel::Dim<2>* o) { + o[0] = majel::make_dim(5, 6); +} + +__global__ void dyn_idx_gpu(int* o) { + auto d = majel::make_dim(5, 6); + o[0] = d[1]; +} + +TEST(Dim, Equality) { + // construct a Dim on the CPU + auto a = majel::make_dim(3, 4); + EXPECT_EQ(majel::get<0>(a), 3); + EXPECT_EQ(majel::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector<majel::Dim<2>> t(2); + test<<<1,1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(majel::get<0>(a), 5); + EXPECT_EQ(majel::get<1>(a), 6); + + // linearization + auto b = majel::make_dim(7, 8); + EXPECT_EQ(majel::linearize(a, b), 83); + + // product + EXPECT_EQ(majel::product(a), 30); + + // mutate a Dim + majel::get<1>(b) = 10; + EXPECT_EQ(majel::get<0>(b), 7); + EXPECT_EQ(majel::get<1>(b), 10); + + // dynamic access + majel::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(majel::get<0>(b), 8); + EXPECT_EQ(majel::get<1>(b), 11); + EXPECT_EQ(majel::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector<int> r(1); + dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data())); + int res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + majel::Dim<3> c = majel::ex_prefix_mul(majel::Dim<3>(3, 4, 5)); + EXPECT_EQ(majel::get<0>(c), 1); + EXPECT_EQ(majel::get<1>(c), 3); + EXPECT_EQ(majel::get<2>(c), 12); + + // contiguous_strides + c = majel::contiguous_strides(majel::Dim<3>(10, 1, 10)); + EXPECT_EQ(majel::get<0>(c), 1); + EXPECT_EQ(majel::get<1>(c), 0); + EXPECT_EQ(majel::get<2>(c), 10); + c = majel::contiguous_strides(majel::Dim<3>(10, 10, 1)); + EXPECT_EQ(majel::get<0>(c), 1); + EXPECT_EQ(majel::get<1>(c), 10); + EXPECT_EQ(majel::get<2>(c), 0); + c = majel::contiguous_strides(majel::Dim<3>(1, 10, 10)); + EXPECT_EQ(majel::get<0>(c), 0); + EXPECT_EQ(majel::get<1>(c), 1); + EXPECT_EQ(majel::get<2>(c), 10); + c = majel::contiguous_strides(majel::Dim<3>(2, 3, 4)); + EXPECT_EQ(majel::get<0>(c), 1); + EXPECT_EQ(majel::get<1>(c), 2); + EXPECT_EQ(majel::get<2>(c), 6); + + // generate from an index + auto size = majel::make_dim(4, 5, 2); + c = majel::Dim<3>(14, size); + EXPECT_EQ(majel::get<0>(c), 2); + EXPECT_EQ(majel::get<1>(c), 3); + EXPECT_EQ(majel::get<2>(c), 0); + c = majel::Dim<3>(25, size); + EXPECT_EQ(majel::get<0>(c), 1); + EXPECT_EQ(majel::get<1>(c), 1); + EXPECT_EQ(majel::get<2>(c), 1); +} + +TEST(Dim, Bool) { + auto a = majel::make_dim(3, 4); + auto b = majel::make_dim(5, 6); + auto c = majel::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(majel::contained(a, b)); + EXPECT_FALSE(majel::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); + + // contiguous check + int x = 4, y = 5, z = 2; + majel::Dim<3> sizef(x, y, z); + majel::Dim<3> stridea(1, x, x*y); + majel::Dim<3> strideb(2, 2*x, 2*x*y); + majel::Dim<3> stridec(1, x, 2*x*y); + EXPECT_TRUE(majel::contiguous(sizef, stridea)); + EXPECT_FALSE(majel::contiguous(sizef, strideb)); + EXPECT_FALSE(majel::contiguous(sizef, stridec)); +} + +TEST(Dim, 
Print) { + { + std::stringstream ss; + auto a = majel::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << majel::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } +} diff --git a/paddle/majel/test/place_test.cc b/paddle/majel/test/place_test.cc index c9a53802b23ef8b225b9e8ef0acfe1b0c5562289..c5fa65ef6d63a07d7919f789c5c42ffe6908e327 100644 --- a/paddle/majel/test/place_test.cc +++ b/paddle/majel/test/place_test.cc @@ -1,6 +1,7 @@ -#include "majel/place.h" +#include "paddle/majel/place.h" #include <sstream> #include "gtest/gtest.h" +#include "paddle/utils/Logging.h" TEST(Place, Equality) { majel::CpuPlace cpu; @@ -37,4 +38,5 @@ TEST(Place, Print) { ss << majel::CpuPlace(); EXPECT_EQ("CpuPlace", ss.str()); } + LOG(INFO) << "\n[----------] Done \n"; } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 3b6f0270de16627821624dd1266a0a1c089323b0..ec81e1dc3d21d1f16b8ad2988793074b838b8d4d 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -225,6 +225,24 @@ class LayerType(object): class AggregateLevel(object): + """ + PaddlePaddle supports three sequence types: + + - :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence. + - :code:`SequenceType.SEQUENCE` means the sample is a sequence. + - :code:`SequenceType.SUB_SEQUENCE` means the sample is a nested sequence, + each timestep of which is also a sequence. + + Accordingly, AggregateLevel supports two modes: + + - :code:`AggregateLevel.EACH_TIMESTEP` means the aggregation acts on each + timestep of a sequence; both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will + be aggregated to :code:`NO_SEQUENCE`. + + - :code:`AggregateLevel.EACH_SEQUENCE` means the aggregation acts on each + sequence of a nested sequence; :code:`SUB_SEQUENCE` will be aggregated to + :code:`SEQUENCE`. + """ EACH_TIMESTEP = 'non-seq' EACH_SEQUENCE = 'seq' @@ -1454,6 +1472,19 @@ def first_seq(input, class ExpandLevel(object): + """ + Please refer to AggregateLevel first. + + ExpandLevel supports two modes: + + - :code:`ExpandLevel.FROM_TIMESTEP` means the expansion acts on each + timestep of a sequence; :code:`NO_SEQUENCE` will be expanded to + :code:`SEQUENCE` or :code:`SUB_SEQUENCE`. + + - :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on each + sequence of a nested sequence; :code:`SEQUENCE` will be expanded to + :code:`SUB_SEQUENCE`. + """
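+    # Hypothetical usage sketch (not part of this patch; names are
+    # illustrative):
+    #
+    #   expanded = expand_layer(input=weight,
+    #                           expand_as=words,
+    #                           expand_level=ExpandLevel.FROM_TIMESTEP)
+    #
+    # where `weight` is a NO_SEQUENCE input and `words` is a SEQUENCE input.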
+ """ FROM_TIMESTEP = AggregateLevel.EACH_TIMESTEP FROM_SEQUENCE = AggregateLevel.EACH_SEQUENCE diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 13f53919aa49694f722d4bf20a7d01af3e3e6533..85ad6984ba08440d8f8c24a6ca5842024dbafe4b 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -1,10 +1,10 @@ import numpy as np try: import cv2 -except ImportError: - cv2 = None - -from cv2 import resize +except: + print( + "import cv2 error, please install opencv-python: pip install opencv-python" + ) __all__ = [ "load_image", "resize_short", "to_chw", "center_crop", "random_crop", @@ -76,7 +76,7 @@ def resize_short(im, size): h_new = size * h / w else: w_new = size * w / h - im = resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) + im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC) return im diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 89cca7acd34b8dea0572169338649b5e9ff6536a..3d9caeec5897fcd5b9e084aff496d150efee2066 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -404,8 +404,8 @@ class RecurrentLayerOutput(Layer): LayerV2 = Layer data = DataLayerV2 data.__name__ = 'data' -AggregateLevel = conf_helps.layers.AggregateLevel -ExpandLevel = conf_helps.layers.ExpandLevel +AggregateLevel = conf_helps.AggregateLevel +ExpandLevel = conf_helps.ExpandLevel memory = MemoryV2 memory.__name__ = 'memory' memory.__doc__ = conf_helps.memory.__doc__ diff --git a/python/setup.py.in b/python/setup.py.in index 7d9438e3f8132c2a7fa4774750f5fd15f3beed14..5dfb46192ae54fdc36b0867312cf156aefb84f84 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -18,7 +18,6 @@ setup(name='paddle', "numpy", "protobuf==${PROTOBUF_VERSION}", "matplotlib", - "opencv-python", ], packages=packages, package_dir={