diff --git a/CMakeLists.txt b/CMakeLists.txt
index fc85f83b94f22459002b17d66cb6ac98cbff9bd0..884afa962bbaff1defe610a9cd5b4a6e5d46c7c3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -92,6 +92,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 
+include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..22a26d7c5b04ba1f45de5ec9f3387c539ade730b
--- /dev/null
+++ b/cmake/generic.cmake
@@ -0,0 +1,129 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# To simplify the build process of PaddlePaddle, we defined couple of
+# fundamental abstractions, e.g., how to build library, binary and
+# test in C++, CUDA and Go.
+# 
+# -------------------------------------------
+#    C++	      CUDA C++	      Go
+# -------------------------------------------
+# cc_library	 nv_library	  go_library
+# cc_binary  	 nv_binary	  go_binary
+# cc_test        nv_test	  go_test
+# -------------------------------------------
+#
+# cmake_parse_arguments can help us to achieve this goal.
+# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+
+# cc_library parses tensor.cc and figures out that target also depend on tensor.h.
+# cc_library(tensor
+#   SRCS
+#   tensor.cc
+#   DEPS
+#   variant)
+function(cc_library TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (${cc_library_OPTIONAL} STREQUAL "SHARED")
+    add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
+  else()
+    add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+  endif()
+  add_dependencies(${TARGET_NAME} ${cc_library_DEPS} ${external_project_dependencies})
+endfunction(cc_library)
+
+# cc_binary parses tensor.cc and figures out that target also depend on tensor.h.
+# cc_binary(tensor
+#   SRCS
+#   tensor.cc)
+function(cc_binary TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_executable(${TARGET_NAME} ${cc_binary_SRCS})
+  add_dependencies(${TARGET_NAME} ${cc_binary_DEPS} ${external_project_dependencies})
+  target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
+endfunction(cc_binary)
+
+# The dependency to target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
+# cc_test(tensor_test
+#   SRCS
+#   tensor_test.cc
+#   DEPS
+#   tensor)
+function(cc_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  add_executable(${TARGET_NAME} ${cc_test_SRCS})
+  add_dependencies(${TARGET_NAME} ${cc_test_DEPS} ${external_project_dependencies})
+  target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES})
+  add_test(${TARGET_NAME} ${TARGET_NAME})
+endfunction(cc_test)
+
+# Suppose that ops.cu includes global functions that take Tensor as
+# their parameters, so ops depend on tensor. This implies that if
+# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built.
+# nv_library(ops
+#   SRCS
+#   ops.cu
+#   DEPS
+#   tensor)
+function(nv_library TARGET_NAME)
+  set(options OPTIONAL)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  if (${nv_library_OPTIONAL} STREQUAL "SHARED")
+    cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
+  else()
+    cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+  endif()
+  add_dependencies(${TARGET_NAME} ${nv_library_DEPS} ${external_project_dependencies})
+endfunction(nv_library)
+
+function(nv_binary TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+  add_dependencies(${TARGET_NAME} ${nv_binary_DEPS} ${external_project_dependencies})
+  target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
+endfunction(nv_binary)
+
+# The dependency to target tensor implies that if any of
+# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built.
+# nv_test(ops_test
+#   SRCS
+#   ops_test.cu
+#   DEPS
+#   ops)
+function(nv_test TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
+  add_dependencies(${TARGET_NAME} ${nv_test_DEPS} ${external_project_dependencies})
+  target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} ${GTEST_MAIN_LIBRARIES} ${GTEST_LIBRARIES})
+  add_test(${TARGET_NAME} ${TARGET_NAME})
+endfunction(nv_test)
diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt
index d4977df1185b4c13b7c67e24a80fa479e23d46d4..d4bce38906e9326992f6a44ac5cf25309063806a 100644
--- a/paddle/majel/CMakeLists.txt
+++ b/paddle/majel/CMakeLists.txt
@@ -1,43 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
-
-if(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_SOURCE_DIR})
-    # find #include <majel/xx.h>
-    get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-    include_directories(${PARENT_DIR})
-
-    # find cmake directory modules
-    get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-    # enable boost
-    find_package(Boost REQUIRED)
-    if(NOT Boost_FOUND)
-        message(FATAL "Cannot find Boost library.")
-    endif()
-    include_directories(${Boost_INCLUDE_DIRS})
-
-    # enable c++11
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-    # enable gtest
-    set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
-    set(WITH_TESTING ON)
-    include(external/gtest)
-else()
-    message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
-endif()
-
-########################### Build Majel #############################
-set(MAJEL_CXX_FILES place.cc)
-set(MAJEL_CUDA_FILES "")
-
-if(CUDA_FOUND)
-    cuda_add_library(majel ${MAJEL_CUDA_FILES} ${MAJEL_CXX_FILES})
-else()
-    add_library(majel ${MAJEL_CXX_FILES})
-endif()
-add_dependencies(majel ${external_project_dependencies})
-#####################################################################
+cc_library(majel SRCS place.cc)
 
 if(WITH_TESTING)
     add_subdirectory(test)
diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt
index 76327fdd70c3f1763abda1f38c137dbaf27fba30..68f9059874aed8843da1fc598c7d2e57e9b8bbfe 100644
--- a/paddle/majel/test/CMakeLists.txt
+++ b/paddle/majel/test/CMakeLists.txt
@@ -1,10 +1,7 @@
-file(GLOB_RECURSE ALL_TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+cc_test(place_test
+    SRCS place_test.cc
+    DEPS majel)
 
-add_executable(majel_tests ${ALL_TEST_FILES})
-add_dependencies(majel_tests majel)
-target_link_libraries(majel_tests     
-                      ${GTEST_LIBRARIES}
-                      ${GTEST_MAIN_LIBRARIES}
-                      majel
-                     )
-add_test(majel_tests majel_tests)
+if(WITH_GPU)
+    nv_test(cuda_test SRCS cuda_test.cu)
+endif()
diff --git a/paddle/majel/test/cuda_test.cu b/paddle/majel/test/cuda_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4067dda2f19f7661722d8a14a27c7b32ed6afc92
--- /dev/null
+++ b/paddle/majel/test/cuda_test.cu
@@ -0,0 +1,59 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+#define CHECK_ERR(x)                 \
+  if (x != cudaSuccess) {            \
+    fprintf(stderr,                  \
+            "%s in %s at line %d\n", \
+            cudaGetErrorString(err), \
+            __FILE__,                \
+            __LINE__);               \
+    exit(-1);                        \
+  }
+
+__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  if (i < n) {
+    d_C[i] = d_A[i] + d_B[i];
+  }
+}
+
+TEST(Cuda, Equality) {
+  int n = 10;
+  // Memory allocation for h_A, h_B and h_C (in the host)
+  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
+  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
+  float h_C[10];
+  float *d_A, *d_B, *d_C;
+  cudaError_t err;
+  // Memory allocation for d_A, d_B and d_C (in the device)
+  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
+  CHECK_ERR(err);
+
+  // Copying memory to device
+  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
+  CHECK_ERR(err);
+
+  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
+  CHECK_ERR(err);
+
+  // Calling the kernel
+  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
+
+  // Copying results back to host
+  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
+  CHECK_ERR(err);
+
+  EXPECT_EQ(h_C[0], 1.0);
+  for (int i = 1; i < n - 1; ++i) {
+    EXPECT_EQ(h_C[i], 11.0);
+  }
+  EXPECT_EQ(h_C[9], 1.0);
+}