diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index 2d81fd431716f9f1aef3d9b76c166807495cfb17..8cfd026f8ff8e044ffbd2cc76c34843072261ab1 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -11,4 +11,3 @@ endforeach()
 
 add_subdirectory(unittests)
 add_subdirectory(book)
-add_subdirectory(book_memory_optimization)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt b/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt
deleted file mode 100644
index 213af5d27f711214feda3d200ced57bf71fbf6c2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-list(REMOVE_ITEM TEST_OPS test_memopt_image_classification_train)
-py_test(test_memopt_image_classification_train_resnet SRCS test_memopt_image_classification_train.py ARGS resnet)
-py_test(test_memopt_image_classification_train_vgg SRCS test_memopt_image_classification_train.py ARGS vgg)
-
-# default test
-foreach(src ${TEST_OPS})
-  py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
deleted file mode 100644
index a231bbfbc8d5712275c92b4d27580016825ea91b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import sys
-
-import paddle
-import paddle.fluid as fluid
-import math
-import sys
-
-# need to fix random seed and training data to compare the loss
-# value accurately calculated by the default and the memory optimization
-# version.
-fluid.default_startup_program().random_seed = 111
-
-
-def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
-        tmp = fluid.layers.conv2d(
-            input=input,
-            filter_size=filter_size,
-            num_filters=ch_out,
-            stride=stride,
-            padding=padding,
-            act=None,
-            bias_attr=False)
-        return fluid.layers.batch_norm(input=tmp, act=act)
-
-    def shortcut(input, ch_in, ch_out, stride):
-        if ch_in != ch_out:
-            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
-        else:
-            return input
-
-    def basicblock(input, ch_in, ch_out, stride):
-        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
-        short = shortcut(input, ch_in, ch_out, stride)
-        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
-
-    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
-        tmp = block_func(input, ch_in, ch_out, stride)
-        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1)
-        return tmp
-
-    assert (depth - 2) % 6 == 0
-    n = (depth - 2) // 6
-    conv1 = conv_bn_layer(
-        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
-    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
-    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
-    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
-    pool = fluid.layers.pool2d(
-        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
-    return pool
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-classdim = 10
-data_shape = [3, 32, 32]
-
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-net_type = "vgg"
-if len(sys.argv) >= 2:
-    net_type = sys.argv[1]
-
-if net_type == "vgg":
-    print("train vgg net")
-    net = vgg16_bn_drop(images)
-elif net_type == "resnet":
-    print("train resnet")
-    net = resnet_cifar10(images, 32)
-else:
-    raise ValueError("%s network is not supported" % net_type)
-
-predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(cost)
-
-optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-opts = optimizer.minimize(avg_cost)
-
-batch_size = fluid.layers.create_tensor(dtype='int64')
-batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
-
-fluid.memory_optimize(fluid.default_main_program(), level=0)
-# fluid.release_memory(fluid.default_main_program())
-
-BATCH_SIZE = 16
-PASS_NUM = 1
-
-# fix the order of training data
-train_reader = paddle.batch(
-    paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
-
-# train_reader = paddle.batch(
-#     paddle.reader.shuffle(
-#         paddle.dataset.cifar.train10(), buf_size=128 * 10),
-#     batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-exe.run(fluid.default_startup_program())
-
-i = 0
-
-accuracy = fluid.average.WeightedAverage()
-for pass_id in range(PASS_NUM):
-    accuracy.reset()
-    for data in train_reader():
-        loss, acc, weight = exe.run(
-            fluid.default_main_program(),
-            feed=feeder.feed(data),
-            fetch_list=[avg_cost, batch_acc, batch_size])
-        accuracy.add(value=acc, weight=weight)
-        pass_acc = accuracy.eval()
-        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-            pass_acc))
-        # this model is slow, so if we can train two mini batch, we think it works properly.
-        if i > 0:
-            exit(0)
-        if math.isnan(float(loss)):
-            sys.exit("got NaN loss, training failed.")
-        i += 1
-exit(1)
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
deleted file mode 100644
index e520c8965089263d1ba10a6057acda1a53cc34a9..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as layers
-from paddle.fluid.executor import Executor
-import math
-import sys
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 32
-word_dim = 16
-IS_SPARSE = True
-batch_size = 10
-max_length = 50
-topk_size = 50
-trg_dic_size = 10000
-
-decoder_size = hidden_dim
-
-# need to fix random seed and training data to compare the loss
-# value accurately calculated by the default and the memory optimization
-# version.
-fluid.default_startup_program().random_seed = 111
-
-
-def encoder_decoder():
-    # encoder
-    src_word_id = layers.data(
-        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
-    src_embedding = layers.embedding(
-        input=src_word_id,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
-    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
-
-    # decoder
-    trg_language_word = layers.data(
-        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
-    trg_embedding = layers.embedding(
-        input=trg_language_word,
-        size=[dict_size, word_dim],
-        dtype='float32',
-        is_sparse=IS_SPARSE,
-        param_attr=fluid.ParamAttr(name='vemb'))
-
-    rnn = fluid.layers.DynamicRNN()
-    with rnn.block():
-        current_word = rnn.step_input(trg_embedding)
-        mem = rnn.memory(init=encoder_out)
-        fc1 = fluid.layers.fc(input=[current_word, mem],
-                              size=decoder_size,
-                              act='tanh')
-        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
-        rnn.update_memory(mem, fc1)
-        rnn.output(out)
-
-    return rnn()
-
-
-def main():
-    rnn_out = encoder_decoder()
-    label = layers.data(
-        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
-    cost = layers.cross_entropy(input=rnn_out, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
-    optimizer.minimize(avg_cost)
-
-    fluid.memory_optimize(fluid.default_main_program())
-    # fluid.release_memory(fluid.default_main_program())
-
-    # fix the order of training data
-    train_data = paddle.batch(
-        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)
-
-    # train_data = paddle.batch(
-    #     paddle.reader.shuffle(
-    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-    #     batch_size=batch_size)
-
-    place = core.CPUPlace()
-    exe = Executor(place)
-
-    exe.run(framework.default_startup_program())
-
-    feed_order = [
-        'src_word_id', 'target_language_word', 'target_language_next_word'
-    ]
-
-    feed_list = [
-        fluid.default_main_program().global_block().var(var_name)
-        for var_name in feed_order
-    ]
-    feeder = fluid.DataFeeder(feed_list, place)
-
-    batch_id = 0
-    for pass_id in range(10):
-        for data in train_data():
-            outs = exe.run(fluid.default_main_program(),
-                           feed=feeder.feed(data),
-                           fetch_list=[avg_cost])
-            avg_cost_val = np.array(outs[0])
-            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
-                  " avg_cost=" + str(avg_cost_val))
-            if batch_id > 2:
-                exit(0)
-            if math.isnan(float(avg_cost_val)):
-                sys.exit("got NaN loss, training failed.")
-            batch_id += 1
-
-
-if __name__ == '__main__':
-    main()