diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 23d75b749998613e2e6e4acdf0e0d6438079e8e1..5d436a7e0c3752c889c19820507589f34d3bee94 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -190,18 +190,18 @@ op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 
 if (WITH_GPU)
-    op_library(conv_op DEPS vol2col depthwise_conv)
+    op_library(conv_op DEPS vol2col depthwise_conv im2col)
 else()
-    op_library(conv_op DEPS vol2col)
+    op_library(conv_op DEPS vol2col im2col)
 endif()
-op_library(conv_transpose_op DEPS vol2col)
+op_library(conv_transpose_op DEPS vol2col im2col)
 
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
-op_library(concat_op DEPS concat_functor)
+op_library(concat_op DEPS concat)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 751e69b1c89dd2f69b43758fe66adf39846a8ab2..a181d802262d15b188060dae4330cec0e24714ab 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,49 +1,59 @@
 add_subdirectory(detail)
 
-if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
-    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
-    nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
-    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
-    nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
-    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
-    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
-    nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
-    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
-    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
-    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
-    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
-    nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
-    nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
-    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
-    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
-    nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
-    nv_library(concat_functor SRCS concat.cc concat.cu DEPS device_context tensor)
-else()
-    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
-    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
-    cc_library(softmax SRCS softmax.cc DEPS device_context)
-    cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
-    cc_library(pooling SRCS pooling.cc DEPS device_context)
-    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
-    cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
-    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
-    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
-    cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
-    cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
-    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
-    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
-    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
-    cc_library(concat_functor SRCS concat.cc DEPS device_context tensor)
-endif()
+function(math_library TARGET)
+    # math_library is a function to create math library. 
+    # The interface is the same as cc_library. 
+    # But it handle split GPU/CPU code and link some common library.
+    set(cc_srcs)
+    set(cu_srcs)
+    set(math_common_deps device_context framework_proto)
+    set(multiValueArgs DEPS)
+    cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+        list(APPEND cc_srcs ${TARGET}.cc)
+    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+        list(APPEND cu_srcs ${TARGET}.cu)
+    endif()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    elseif(${cc_srcs_len} GREATER 0)
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    endif()
+endfunction()
 
-cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+# please add new math_library in alphabetical order
+math_library(concat)
+math_library(context_project DEPS im2col math_function)
+math_library(cross_entropy)
+math_library(cos_sim_functor)
+math_library(depthwise_conv)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(im2col)
+math_library(lstm_compute DEPS activation_functions)
+math_library(math_function DEPS cblas)
+math_library(maxouting)
+math_library(pooling)
+math_library(selected_rows_functor DEPS selected_rows)
+math_library(sequence2batch)
+math_library(sequence_padding)
+math_library(sequence_pooling DEPS math_function)
+math_library(sequence_scale)
+math_library(softmax)
+math_library(unpooling)
+math_library(vol2col)
+
+cc_test(math_function_test SRCS math_function_test.cc)
 cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
-cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
-cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
+cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
 cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
-cc_test(concat_test SRCS concat_test.cc DEPS concat_functor tensor)
+if(WITH_GPU)
+    nv_test(math_function_gpu_test SRCS math_function_test.cu)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
+endif()
+cc_test(concat_test SRCS concat_test.cc DEPS concat)
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index 72bf2ab17016157f22bcf180090660346ed4f60d..8899abff360ea867872d3433722cdb37ef358500 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py
index 708ca08b17c85efbb25ecaf2580b7141421e25b9..4fa2d03ef563b98b2eec576bf87d4b2e54ca0a36 100644
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/memory_optimization_transpiler.py
@@ -31,6 +31,8 @@ dtype_to_size = {
 
 sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
 
+PRINT_LOG = False
+
 
 class ControlFlowGraph(object):
     def __init__(self, Program, ops, forward_num, skip_opt):
@@ -171,12 +173,14 @@ class ControlFlowGraph(object):
                                 # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
                                 # and dtype_to_size[cache_dtype]
                                 if x_dtype == cache_dtype:
-                                    print(("Hit Cache !!!! cache pool index "
-                                           "is %d, var name is %s, "
-                                           "cached var name is %s, "
-                                           "var shape is %s ") %
-                                          (index, x, cache_var,
-                                           str(cache_shape)))
+                                    if PRINT_LOG:
+                                        print(
+                                            ("Hit Cache !!!! cache pool index "
+                                             "is %d, var name is %s, "
+                                             "cached var name is %s, "
+                                             "var shape is %s ") %
+                                            (index, x, cache_var,
+                                             str(cache_shape)))
                                     self.pool.pop(index)
                                     if x == cache_var:
                                         break
@@ -277,7 +281,9 @@ def _get_cfgs(input_program):
     return cfgs
 
 
-def memory_optimize(input_program):
+def memory_optimize(input_program, print_log=False):
+    global PRINT_LOG
+    PRINT_LOG = print_log
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize()
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index 7648bb9fe1c8d2dae3590fa67f781743ba297c32..c9d2a5ecaab0669f308b5b9c5cf74d0212fa462a 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost)
 sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 sgd_optimizer.minimize(avg_cost)
 
-fluid.memory_optimize(fluid.default_main_program())
+fluid.memory_optimize(fluid.default_main_program(), print_log=True)
 
 BATCH_SIZE = 200