Unverified commit 29fac3c0, authored by Qiyang Min, committed by GitHub

Merge pull request #12390 from velconia/port_python3_syntax

Apply 2to3 to current paddle main python code
......@@ -73,6 +73,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
if(NOT PY_VERSION)
set(PY_VERSION 2.7)
endif()
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
......
......@@ -394,8 +394,10 @@ All parameter, weight, gradient are variables in Paddle.
InferenceOptimize(*(origin.Proto()), &pruned_desc);
return new ProgramDesc(pruned_desc);
});
m.def("empty_var_name", []() { return framework::kEmptyVarName; });
m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
m.def("empty_var_name",
[]() { return std::string(framework::kEmptyVarName); });
m.def("grad_var_suffix",
[]() { return std::string(framework::kGradVarSuffix); });
m.def_submodule(
"var_names",
"The module will return special predefined variable name in Paddle")
......
......@@ -28,11 +28,12 @@ images per class.
"""
import cPickle
import itertools
import numpy
import paddle.dataset.common
import tarfile
from six.moves import zip
from six.moves import cPickle as pickle
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
......@@ -48,7 +49,7 @@ def reader_creator(filename, sub_name, cycle=False):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None
for sample, label in itertools.izip(data, labels):
for sample, label in zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
......@@ -58,7 +59,7 @@ def reader_creator(filename, sub_name, cycle=False):
while True:
for name in names:
batch = cPickle.load(f.extractfile(name))
batch = pickle.load(f.extractfile(name))
for item in read_batch(batch):
yield item
if not cycle:
......
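For readers following the port, a minimal sketch (not the dataset code itself) of the compatibility pattern this hunk relies on; the batch layout below is a hypothetical stand-in for the real CIFAR archives:

```python
# six.moves.cPickle resolves to cPickle on Python 2 and to pickle on
# Python 3; the builtin zip is already lazy on Python 3, so it replaces
# itertools.izip.
import numpy
from six.moves import cPickle as pickle
from six.moves import zip

def read_batch(batch):
    data = batch['data']
    labels = batch.get('labels', batch.get('fine_labels', None))
    assert labels is not None
    for sample, label in zip(data, labels):
        yield (numpy.asarray(sample) / 255.0).astype(numpy.float32), int(label)

# Note: batches pickled by Python 2 generally need an extra argument when
# loaded under Python 3, e.g. pickle.load(f, encoding='latin1').
```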
......@@ -20,9 +20,8 @@ import shutil
import sys
import importlib
import paddle.dataset
import cPickle
import six.moves.cPickle as pickle
import glob
import cPickle as pickle
__all__ = [
'DATA_HOME',
......@@ -75,13 +74,13 @@ def download(url, module_name, md5sum, save_name=None):
retry_limit = 3
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if os.path.exists(filename):
print "file md5", md5file(filename), md5sum
print("file md5", md5file(filename), md5sum)
if retry < retry_limit:
retry += 1
else:
raise RuntimeError("Cannot download {0} within retry limit {1}".
format(url, retry_limit))
print "Cache file %s not found, downloading %s" % (filename, url)
print("Cache file %s not found, downloading %s" % (filename, url))
r = requests.get(url, stream=True)
total_length = r.headers.get('content-length')
......@@ -104,8 +103,9 @@ def download(url, module_name, md5sum, save_name=None):
def fetch_all():
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.dataset)):
for module_name in [
x for x in dir(paddle.dataset) if not x.startswith("__")
]:
if "fetch" in dir(
importlib.import_module("paddle.dataset.%s" % module_name)):
getattr(
......@@ -114,8 +114,9 @@ def fetch_all():
def fetch_all_recordio(path):
for module_name in filter(lambda x: not x.startswith("__"),
dir(paddle.dataset)):
for module_name in [
x for x in dir(paddle.dataset) if not x.startswith("__")
]:
if "convert" in dir(
importlib.import_module("paddle.dataset.%s" % module_name)) and \
not module_name == "common":
......@@ -126,7 +127,7 @@ def fetch_all_recordio(path):
"convert")(ds_path)
def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
"""
you can call the function as:
......@@ -167,7 +168,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
def cluster_files_reader(files_pattern,
trainer_count,
trainer_id,
loader=cPickle.load):
loader=pickle.load):
"""
Create a reader that yields elements from the given files, selecting
a file set according to trainer_count and trainer_id
......@@ -188,7 +189,7 @@ def cluster_files_reader(files_pattern,
my_file_list = []
for idx, fn in enumerate(file_list):
if idx % trainer_count == trainer_id:
print "append file: %s" % fn
print("append file: %s" % fn)
my_file_list.append(fn)
for fn in my_file_list:
with open(fn, "r") as f:
......@@ -221,7 +222,7 @@ def convert(output_path, reader, line_count, name_prefix):
for l in lines:
# FIXME(Yancey1989):
# dumps with protocol: pickle.HIGHEST_PROTOCOL
writer.write(cPickle.dumps(l))
writer.write(pickle.dumps(l))
writer.close()
lines = []
......
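A small sketch of why the print rewrites above stay compatible: with the future import, the function form (including the file= keyword) behaves the same on Python 2 and Python 3. The file name and URL below are hypothetical placeholders:

```python
from __future__ import print_function
import sys

filename = 'data.tar.gz'                    # hypothetical
url = 'http://example.com/data.tar.gz'      # hypothetical
print("Cache file %s not found, downloading %s" % (filename, url))
print("download failed", file=sys.stderr)   # only legal as a function call
```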
......@@ -24,6 +24,7 @@ import tarfile
import gzip
import itertools
import paddle.dataset.common
from six.moves import zip
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
......@@ -87,12 +88,12 @@ def corpus_reader(data_path, words_name, props_name):
sentences = []
labels = []
one_seg = []
for word, label in itertools.izip(words_file, props_file):
for word, label in zip(words_file, props_file):
word = word.strip()
label = label.strip().split()
if len(label) == 0: # end of sentence
for i in xrange(len(one_seg[0])):
for i in range(len(one_seg[0])):
a_kind_lable = [x[i] for x in one_seg]
labels.append(a_kind_lable)
......
......@@ -28,10 +28,9 @@ Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
"""
import cPickle
import itertools
import functools
from common import download
from .common import download
import tarfile
import scipy.io as scio
from paddle.dataset.image import *
......@@ -39,6 +38,8 @@ from paddle.reader import *
import os
import numpy as np
from multiprocessing import cpu_count
from six.moves import cPickle as pickle
from six.moves import zip
__all__ = ['train', 'test', 'valid']
DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
......@@ -116,10 +117,10 @@ def reader_creator(data_file,
file = file.strip()
batch = None
with open(file, 'r') as f:
batch = cPickle.load(f)
batch = pickle.load(f)
data = batch['data']
labels = batch['label']
for sample, label in itertools.izip(data, batch['label']):
for sample, label in zip(data, batch['label']):
yield sample, int(label) - 1
if not cycle:
break
......
......@@ -36,7 +36,7 @@ except ImportError:
cv2 = None
import os
import tarfile
import cPickle
import six.moves.cPickle as pickle
__all__ = [
"load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
......@@ -86,10 +86,10 @@ def batch_images_from_tar(data_file,
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
protocol=pickle.HIGHEST_PROTOCOL)
file_id += 1
data = []
labels = []
......@@ -97,10 +97,10 @@ def batch_images_from_tar(data_file,
output = {}
output['label'] = labels
output['data'] = data
cPickle.dump(
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
protocol=cPickle.HIGHEST_PROTOCOL)
protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta:
for file in os.listdir(out_path):
......
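One caveat the automated port does not catch, sketched below with a hypothetical path and payload: pickle writes bytes, so the handles opened with 'w' above only work on Python 2; Python 3 requires binary mode.

```python
# Hedged sketch: open the batch file in binary mode so pickle.dump works on
# both Python 2 and Python 3.
from six.moves import cPickle as pickle

output = {'label': [0, 1], 'data': [[0.1, 0.2], [0.3, 0.4]]}
with open('/tmp/batch_0', 'wb') as f:        # hypothetical output path
    pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)
```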
......@@ -42,13 +42,13 @@ def tokenize(pattern):
# sequential access of member files, other than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = tarf.next()
tf = next(tarf)
while tf is not None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
None, string.punctuation).lower().split()
tf = tarf.next()
tf = next(tarf)
def build_dict(pattern, cutoff):
......@@ -62,11 +62,11 @@ def build_dict(pattern, cutoff):
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary))
word_idx = dict(zip(words, xrange(len(words))))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx['<unk>'] = len(words)
return word_idx
......
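Two Python 3 pitfalls remain in the tokenizer above even after this hunk: tarfile members are read as bytes, and str.translate no longer accepts the (None, deletechars) form. A hedged, Python-3-only sketch of equivalent tokenization (the helper name is hypothetical):

```python
import string

def tokenize_bytes(raw):
    # tarf.extractfile(tf).read() returns bytes on Python 3
    text = raw.decode('utf-8').rstrip('\n\r')
    # Python 3 str.translate takes a table built by str.maketrans
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table).lower().split()

assert tokenize_bytes(b'A great movie!\n') == ['a', 'great', 'movie']
```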
......@@ -64,11 +64,11 @@ def build_dict(min_word_freq=50):
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(zip(words, xrange(len(words))))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx['<unk>'] = len(words)
return word_idx
......
......@@ -65,7 +65,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
images = images / 255.0 * 2.0 - 1.0
for i in xrange(buffer_size):
for i in range(buffer_size):
yield images[i, :], int(labels[i])
finally:
try:
......
......@@ -16,7 +16,7 @@ Movielens 1-M dataset.
Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
movies, which was collected by GroupLens Research. This module will download
Movielens 1-M dataset from
Movielens 1-M dataset from
http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
set and test set into paddle reader creators.
......@@ -187,7 +187,7 @@ def max_movie_id():
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
def max_user_id():
......@@ -195,7 +195,7 @@ def max_user_id():
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, USER_INFO.viewvalues()).index
return reduce(__max_index_info__, list(USER_INFO.values())).index
def __max_job_id_impl__(a, b):
......@@ -210,7 +210,7 @@ def max_job_id():
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
def movie_categories():
......@@ -243,7 +243,7 @@ def unittest():
for test_count, _ in enumerate(test()()):
pass
print train_count, test_count
print(train_count, test_count)
def fetch():
......
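The viewvalues rewrites above hide a second portability detail: reduce is no longer a builtin on Python 3. A hedged sketch with hypothetical records standing in for the real movie/user info:

```python
from functools import reduce  # builtin on Python 2, functools-only on Python 3

class Info(object):
    def __init__(self, index):
        self.index = index

MOVIE_INFO = {1: Info(10), 2: Info(42), 3: Info(7)}   # hypothetical data

def __max_index_info__(a, b):
    return a if a.index > b.index else b

# dict.viewvalues() is Python 2 only; list(d.values()) works everywhere.
assert reduce(__max_index_info__, list(MOVIE_INFO.values())).index == 42
```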
......@@ -26,7 +26,7 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
import os
import functools
import rarfile
from common import download
from .common import download
import numpy as np
# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
......@@ -53,7 +53,7 @@ class Query(object):
----------
query_id : int
query_id in dataset, mapping from query to relevance documents
relevance_score : int
relevance_score : int
relevance score of query and document pair
feature_vector : array, dense feature
feature in vector format
......@@ -92,7 +92,7 @@ class Query(object):
sys.stdout.write("expect 48 space split parts, get %d" %
(len(parts)))
return None
# format : 0 qid:10 1:0.000272 2:0.000000 ....
# format : 0 qid:10 1:0.000272 2:0.000000 ....
self.relevance_score = int(parts[0])
self.query_id = int(parts[1].split(':')[1])
for p in parts[2:]:
......@@ -295,7 +295,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
--------
filename : string
fill_missing : fill the missing value. default in MQ2007 is -1
Returns
------
yield
......@@ -330,4 +330,4 @@ if __name__ == "__main__":
mytest = functools.partial(
__reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
for label, query in mytest():
print label, query
print(label, query)
......@@ -43,11 +43,11 @@ def download_data_if_not_yet():
nltk.data.path.append(paddle.dataset.common.DATA_HOME)
movie_reviews.categories()
except LookupError:
print "Downloading movie_reviews data set, please wait....."
print("Downloading movie_reviews data set, please wait.....")
nltk.download(
'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path
print("Download data set success.....")
print("Path is " + nltk.data.find('corpora/movie_reviews').path)
def get_word_dict():
......@@ -64,7 +64,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = word_freq_dict.items()
words_sort_list = list(word_freq_dict.items())
words_sort_list.sort(key=lambda x: x[1], reverse=True)  # cmp= was removed in Python 3
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
......@@ -80,7 +80,8 @@ def sort_files():
files_list = list()
neg_file_list = movie_reviews.fileids('neg')
pos_file_list = movie_reviews.fileids('pos')
files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
files_list = list(
chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
return files_list
......
......@@ -36,7 +36,7 @@ class TestCommon(unittest.TestCase):
def test_split(self):
def test_reader():
def reader():
for x in xrange(10):
for x in range(10):
yield x
return reader
......@@ -49,7 +49,7 @@ class TestCommon(unittest.TestCase):
def test_cluster_file_reader(self):
_, temp_path = tempfile.mkstemp()
for x in xrange(5):
for x in range(5):
with open(temp_path + '/%05d.test' % x, 'w') as f:
f.write('%d\n' % x)
reader = paddle.dataset.common.cluster_files_reader(
......@@ -63,7 +63,7 @@ class TestCommon(unittest.TestCase):
def test_reader():
def reader():
for x in xrange(record_num):
for x in range(record_num):
yield x
return reader
......
......@@ -59,7 +59,7 @@ class TestMikolov(unittest.TestCase):
self.assertEqual(first_line, read_line)
def test_total(self):
_, idx = zip(*WORD_DICT.items())
_, idx = list(zip(*list(WORD_DICT.items())))
self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
......
......@@ -24,9 +24,8 @@ from nltk.corpus import movie_reviews
class TestSentimentMethods(unittest.TestCase):
def test_get_word_dict(self):
word_dict = st.get_word_dict()[0:10]
test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
(u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
(u'is', 8), (u'in', 9)]
test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
for idx, each in enumerate(word_dict):
self.assertEqual(each, test_word_list[idx])
self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
......
......@@ -49,9 +49,12 @@ def feature_range(maximums, minimums):
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
feature_num = len(maximums)
ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
ax.bar(list(range(feature_num)),
maximums - minimums,
color='r',
align='center')
ax.set_title('feature scale')
plt.xticks(range(feature_num), feature_names)
plt.xticks(list(range(feature_num)), feature_names)
plt.xlim([-1, feature_num])
fig.set_figheight(6)
fig.set_figwidth(10)
......@@ -71,7 +74,7 @@ def load_data(filename, feature_num=14, ratio=0.8):
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in xrange(feature_num - 1):
for i in range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
......
......@@ -154,8 +154,8 @@ def get_dict(dict_size, reverse=True):
tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse:
src_dict = {v: k for k, v in src_dict.items()}
trg_dict = {v: k for k, v in trg_dict.items()}
src_dict = {v: k for k, v in list(src_dict.items())}
trg_dict = {v: k for k, v in list(trg_dict.items())}
return src_dict, trg_dict
......
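A side note on the dict-reversal rewrite above: the list() wrapper 2to3 inserts is harmless but unnecessary, since a dict comprehension consumes any iterable. Sketch with hypothetical vocabulary entries:

```python
src_dict = {'<s>': 0, '<e>': 1, 'hello': 2}   # hypothetical vocabulary
reversed_dict = {v: k for k, v in src_dict.items()}
assert reversed_dict[2] == 'hello'
```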
......@@ -70,7 +70,9 @@ def __build_dict(tar_file, dict_size, save_path, lang):
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate(
sorted(
word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
iter(list(word_dict.items())),
key=lambda x: x[1],
reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0]))
......
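Similarly, the iter(list(word_dict.items())) produced by the automated port above can be simplified: sorted() accepts any iterable on both Pythons. A sketch with hypothetical frequency counts:

```python
word_dict = {'the': 10, 'a': 7, 'unk': 2}     # hypothetical frequencies
for idx, (word, freq) in enumerate(
        sorted(word_dict.items(), key=lambda x: x[1], reverse=True)):
    pass  # idx 0 -> ('the', 10), then ('a', 7), then ('unk', 2)
```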
......@@ -14,49 +14,49 @@
from __future__ import print_function
# import all class inside framework into fluid module
import framework
from framework import *
from . import framework
from .framework import *
# import all class inside executor into fluid module
import executor
from executor import *
import trainer
from trainer import Trainer
from trainer import BeginEpochEvent
from trainer import EndEpochEvent
from trainer import BeginStepEvent
from trainer import EndStepEvent
from trainer import CheckpointConfig
import inferencer
from inferencer import Inferencer
import io
import evaluator
import initializer
import layers
import contrib
import nets
import optimizer
import backward
import regularizer
import average
import metrics
import transpiler
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from transpiler import DistributeTranspiler, InferenceTranspiler, \
from . import executor
from .executor import *
from . import trainer
from .trainer import Trainer
from .trainer import BeginEpochEvent
from .trainer import EndEpochEvent
from .trainer import BeginStepEvent
from .trainer import EndStepEvent
from .trainer import CheckpointConfig
from . import inferencer
from .inferencer import Inferencer
from . import io
from . import evaluator
from . import initializer
from . import layers
from . import contrib
from . import nets
from . import optimizer
from . import backward
from . import regularizer
from . import average
from . import metrics
from . import transpiler
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
from concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from lod_tensor import create_lod_tensor, create_random_int_lodtensor
import clip
import profiler
import unique_name
import recordio_writer
import parallel_executor
from parallel_executor import *
from .concurrency import (Go, make_channel, channel_send, channel_recv,
channel_close, Select)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import profiler
from . import unique_name
from . import recordio_writer
from . import parallel_executor
from .parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor
......@@ -99,8 +99,8 @@ def __bootstrap__():
None
"""
import sys
import core
import os
from . import core
in_test = 'unittest' in sys.modules
......
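The bulk of this hunk converts implicit relative imports, which Python 3 removed, into explicit ones. A minimal sketch of the rule as it would appear in a hypothetical package module (say pkg/__init__.py with a sibling pkg/framework.py):

```python
# Python 2 resolved "import framework" against the package directory first;
# Python 3 treats it as an absolute import and fails. The explicit forms work
# on both (on Python 2, together with from __future__ import absolute_import):
from . import framework            # was: import framework
from .framework import *           # was: from framework import *
```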
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import functools
import sys
......@@ -28,7 +29,7 @@ def deprecated(since, instead, extra_message=""):
@functools.wraps(func)
def wrapper(*args, **kwargs):
print >> sys.stderr, err_msg
print(err_msg, file=sys.stderr)
return func(*args, **kwargs)
wrapper.__doc__ += "\n "
......
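Putting the hunk above in context, a self-contained sketch of the decorator after the port; the deprecation-message format here is assumed, not taken from the file:

```python
from __future__ import print_function
import functools
import sys

def deprecated(since, instead, extra_message=""):
    def decorator(func):
        err_msg = "{} is deprecated since {}; use {} instead. {}".format(
            func.__name__, since, instead, extra_message)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            print(err_msg, file=sys.stderr)  # the py3-safe form from the hunk
            return func(*args, **kwargs)

        return wrapper

    return decorator

@deprecated("0.14.0", "new_api")   # hypothetical version and replacement
def old_api():
    return 0

old_api()  # prints the deprecation message to stderr, then runs
```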
......@@ -16,7 +16,8 @@ from paddle.fluid import framework as framework
from . import core
import collections
import copy
import unique_name
import six
from . import unique_name
__all__ = ['append_backward']
......@@ -44,17 +45,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in inputs.iteritems():
op_desc.set_input(para, args)
for para, args in outputs.iteritems():
op_desc.set_output(para, args)
for para, args in list(inputs.items()):
op_desc.set_input(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
for para, args in list(outputs.items()):
op_desc.set_output(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
if op_role_attr_name not in attrs:
attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in attrs.iteritems():
for name, val in list(attrs.items()):
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
......@@ -105,7 +114,9 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
pos = name.find(core.grad_var_suffix())
if isinstance(name, six.text_type):
name = name.encode()
pos = name.find(six.b(core.grad_var_suffix()))
return name[:pos] if pos != -1 else name
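A standalone sketch of the bytes/text normalization this hunk introduces; the suffix value is assumed to be "@GRAD", matching kGradVarSuffix:

```python
import six

GRAD_SUFFIX = "@GRAD"   # assumed value of core.grad_var_suffix()

def strip_grad_suffix(name):
    # Normalize text to bytes so .find() compares like with like.
    if isinstance(name, six.text_type):
        name = name.encode()
    pos = name.find(six.b(GRAD_SUFFIX))
    return name[:pos] if pos != -1 else name

assert strip_grad_suffix(b"x@GRAD") == b"x"
assert strip_grad_suffix(u"y@GRAD@RENAME@1") == b"y"
```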
......@@ -114,7 +125,9 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
return name + core.grad_var_suffix()
if isinstance(name, six.text_type):
name = name.encode()
return name + six.b(core.grad_var_suffix())
def _addup_repetitive_outputs_(op_descs):
......@@ -174,7 +187,7 @@ def _addup_repetitive_outputs_(op_descs):
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name)
for var_name, inputs in renamed_vars.iteritems():
for var_name, inputs in list(renamed_vars.items()):
if len(inputs) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
......@@ -198,16 +211,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
out_arg_names = op_desc.output_arg_names()
if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
return True
if _all_in_set_(
filter(lambda name: name.find(core.grad_var_suffix()) != -1,
op_desc.input_arg_names()), no_grad_set):
if _all_in_set_([
name for name in op_desc.input_arg_names()
if name.find(core.grad_var_suffix()) != -1
], no_grad_set):
no_grad_set.update(out_arg_names)
return True
return False
# Remove ops whose outputs are all in no_grad_dict
op_descs = filter(
lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
op_descs = [
op_desc for op_desc in op_descs
if not _op_can_be_removed_(op_desc, no_grad_set)
]
# Insert fill_zeros_like_op
to_insert = []
for idx, op_desc in enumerate(op_descs):
......@@ -217,12 +233,12 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
"X": [_strip_grad_suffix_(arg)]
}, {"Out": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
return op_descs
import proto.framework_pb2 as framework_pb2
from .proto import framework_pb2
def serialize_op_decs(op_desc):
......@@ -244,8 +260,10 @@ def _callback_lookup_(op):
if op.type == 'parallel_do' and op.attr('use_nccl'):
all_vars = op.block.vars
param_names = set(op.input('parameters'))
param_names = filter(lambda name: all_vars[name].stop_gradient is False,
param_names)
param_names = [
name for name in param_names
if all_vars[name].stop_gradient is False
]
param_grad_names = [n + "@GRAD" for n in param_names]
class ParallelDoCallBack(object):
......@@ -399,7 +417,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
continue
block.desc.var(grad_var_name)
new_vars.add(grad_var_name)
if not grad_to_var.has_key(grad_var_name):
if grad_var_name not in grad_to_var:
continue
grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
# infer_shape and infer_type
......@@ -427,7 +445,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc.rename_output(name, new_name)
var_map[name] = new_name
for g, ng in var_map.iteritems():
for g, ng in list(var_map.items()):
if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g)
......@@ -439,7 +457,7 @@ def _get_stop_gradients_(program):
for block in program.blocks:
assert isinstance(block, framework.Block)
block_no_grad_set = set()
for var in block.vars.itervalues():
for var in list(block.vars.values()):
assert isinstance(var, framework.Variable)
if var.stop_gradient:
block_no_grad_set.add(_append_grad_suffix_(var.name))
......@@ -452,51 +470,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
"""
Append backward part to main_program.
A complete neural network training is made up of forward and backward
propagation. However, when we configure a network, we only need to
specify its forward part. The backward part is generated automatically
A complete neural network training is made up of forward and backward
propagation. However, when we configure a network, we only need to
specify its forward part. The backward part is generated automatically
according to the forward part by this function.
In most cases, users do not need to invoke this function manually. It
In most cases, users do not need to invoke this function manually. It
will be automatically invoked by the optimizer's `minimize` function.
Args:
loss(Variable): The loss variable of the network.
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
parameter_list(list[string]|None): Names of parameters that need
to be updated by optimizers.
If it is None, all parameters
will be updated.
Default: None
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`stop_gradient=True` from all blocks will
no_grad_set(set|None): Variables in the Block 0 whose gradients
should be ignored. All variables with
`stop_gradient=True` from all blocks will
be automatically added into this set.
Default: None
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must has two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
callbacks(list[callable object]|None): The callbacks are used for
doing some custom jobs during
backward part building. All
callable objects in it will
be invoked once each time a
new gradient operator is added
into the program. The callable
object must has two input
parameters: 'block' and 'context'.
The 'block' is the block which
the new gradient operator will
be added to. The 'context' is a
map, whose keys are gradient
variable names and values are
corresponding original variables.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
In addition to this, the 'context'
has another special key-value pair:
the key is string '__current_op_desc__'
and the value is the op_desc of the
gradient operator who has just
triggered the callable object.
Returns:
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
list[(Variable,Variable)]: Pairs of parameter and its
corresponding gradients. The key is the parameter and the
value is gradient variable.
Raises:
......@@ -535,7 +553,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(program)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
grad_info_map = dict()
root_block = program.block(0)
......@@ -558,7 +576,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
_append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
grad_to_var, callbacks)
......@@ -697,7 +715,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
no_grad_set = set()
no_grad_set = copy.copy(no_grad_set)
no_grad_dict = _get_stop_gradients_(prog)
no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
fwd_op_num = block.desc.op_size()
......@@ -731,7 +749,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
grad_to_var = dict()
grad_info_map = dict()
_append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
......
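One subtle change in this file deserves a note: the old map(lambda p: op_descs.insert(p[1], p[0]), ...) relied on map being eager; on Python 3 map is lazy and the inserts would never run. The list(...) wrapper above forces execution, though an explicit loop is clearer. Sketch with hypothetical descriptors:

```python
op_descs = ['op_a', 'op_b']                       # hypothetical op descriptors
to_insert = [('fill_zeros_like', 0), ('sum', 1)]  # hypothetical (desc, idx) pairs

# An explicit loop runs eagerly on both Python 2 and Python 3.
for desc, idx in reversed(to_insert):
    op_descs.insert(idx, desc)

assert op_descs == ['fill_zeros_like', 'op_a', 'sum', 'op_b']
```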
......@@ -13,10 +13,11 @@
# limitations under the License.
import copy
import six
import functools
import layers
import framework
from . import layers
from . import framework
from . import core
__all__ = [
......@@ -80,8 +81,7 @@ def error_clip_callback(block, context):
# the context is a grad_to_var map
grad_to_var = context
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None or isinstance(error_clip,
......@@ -247,8 +247,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"""
def __init__(self, clip_norm, group_name="default_group"):
if not isinstance(group_name, basestring):
raise TypeError("'group_name' must be a basestring.")
if not isinstance(group_name, six.string_types):
raise TypeError("'group_name' must be a %s." % (six.string_types))
self.clip_norm = clip_norm
self.group_name = group_name
......@@ -284,7 +284,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
x=clip_var,
y=layers.elementwise_max(
x=clip_var, y=group_norm_var))
assert group_scale_var.shape == (1L, )
assert group_scale_var.shape == (1, )
self.context[group_scale_name] = group_scale_var
new_grad = layers.elementwise_mul(
......@@ -313,7 +313,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
program = framework.default_main_program()
if param_list is None:
param_list = program.block(0).all_parameters()
if all(isinstance(elem, basestring) for elem in param_list):
if all(isinstance(elem, six.string_types) for elem in param_list):
param_list = [program.block(0).var(elem) for elem in param_list]
if not all(isinstance(elem, framework.Parameter) for elem in param_list):
raise TypeError(
......
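The basestring to six.string_types substitution above is the standard pattern for type checks that must accept str and unicode on Python 2 but plain str on Python 3; a minimal sketch:

```python
import six

def check_group_name(group_name):
    # six.string_types is (str, unicode) on Python 2 and (str,) on Python 3.
    if not isinstance(group_name, six.string_types):
        raise TypeError("'group_name' must be a %s." % (six.string_types,))
    return group_name

check_group_name("default_group")  # ok
```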
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from layers.control_flow import BlockGuard, equal
from .layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name
from layers import fill_constant
import core
from .layer_helper import LayerHelper, unique_name
from .layers import fill_constant
from . import core
__all__ = [
'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
......
......@@ -12,9 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import decoder
from decoder import *
import memory_usage_calc
from memory_usage_calc import *
from . import decoder
from .decoder import *
from . import memory_usage_calc
from .memory_usage_calc import *
__all__ = decoder.__all__ + memory_usage_calc.__all__
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import beam_search_decoder
from beam_search_decoder import *
from . import beam_search_decoder
from .beam_search_decoder import *
__all__ = beam_search_decoder.__all__
......@@ -22,6 +22,7 @@ This API is still under active development and may change drastically.
import contextlib
import numpy as np
import six
from ... import layers
from ...framework import Variable
......@@ -191,7 +192,7 @@ class StateCell(object):
self._helper = LayerHelper('state_cell', name=name)
self._cur_states = {}
self._state_names = []
for state_name, state in states.items():
for state_name, state in six.iteritems(states):
if not isinstance(state, InitState):
raise ValueError('state must be an InitState object.')
self._cur_states[state_name] = state
......@@ -346,7 +347,7 @@ class StateCell(object):
if self._in_decoder and not self._switched_decoder:
self._switch_decoder()
for input_name, input_value in inputs.items():
for input_name, input_value in six.iteritems(inputs):
if input_name not in self._inputs:
raise ValueError('Unknown input %s. '
'Please make sure %s in input '
......@@ -361,7 +362,7 @@ class StateCell(object):
if self._in_decoder and not self._switched_decoder:
self._switched_decoder()
for state_name, decoder_state in self._states_holder.items():
for state_name, decoder_state in six.iteritems(self._states_holder):
if id(self._cur_decoder_obj) not in decoder_state:
raise ValueError('Unknown decoder object, please make sure '
'switch_decoder been invoked.')
......@@ -671,7 +672,7 @@ class BeamSearchDecoder(object):
feed_dict = {}
update_dict = {}
for init_var_name, init_var in self._input_var_dict.items():
for init_var_name, init_var in six.iteritems(self._input_var_dict):
if init_var_name not in self.state_cell._inputs:
raise ValueError('Variable ' + init_var_name +
' not found in StateCell!\n')
......@@ -721,7 +722,8 @@ class BeamSearchDecoder(object):
self.state_cell.update_states()
self.update_array(prev_ids, selected_ids)
self.update_array(prev_scores, selected_scores)
for update_name, var_to_update in update_dict.items():
for update_name, var_to_update in six.iteritems(
update_dict):
self.update_array(var_to_update, feed_dict[update_name])
def read_array(self, init, is_ids=False, is_scores=False):
......
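The six.iteritems rewrites above keep dict iteration lazy on Python 2 while mapping to d.items() on Python 3; a sketch with hypothetical state-cell contents:

```python
from __future__ import print_function
import six

states = {'hidden': 0, 'cell': 1}   # hypothetical state cell contents
for state_name, state in six.iteritems(states):
    print(state_name, state)
```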
......@@ -12,14 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import core
from . import core
import numpy
import os
import six.moves as six
import six
from six.moves import zip, range, xrange
import multiprocessing
from framework import Variable, default_main_program
from .framework import Variable, default_main_program
__all__ = ['DataFeeder']
......@@ -53,7 +53,7 @@ class DataToLoDTensorConverter(object):
self.data = []
self.lod = []
for i in six.range(lod_level):
for i in six.moves.range(lod_level):
self.lod.append([])
def feed(self, data):
......@@ -142,7 +142,7 @@ class DataFeeder(object):
if program is None:
program = default_main_program()
for each_var in feed_list:
if isinstance(each_var, basestring):
if isinstance(each_var, six.string_types):
each_var = program.block(0).var(each_var)
if not isinstance(each_var, Variable):
raise TypeError("Feed list should contain a list of variable")
......@@ -174,7 +174,7 @@ class DataFeeder(object):
dict: the result of conversion.
"""
converter = []
for lod_level, shape, dtype in six.zip(
for lod_level, shape, dtype in six.moves.zip(
self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
converter.append(
DataToLoDTensorConverter(
......@@ -187,10 +187,12 @@ class DataFeeder(object):
assert len(each_sample) == len(converter), (
"The number of fields in data (%s) does not match " +
"len(feed_list) (%s)") % (len(each_sample), len(converter))
for each_converter, each_slot in six.zip(converter, each_sample):
for each_converter, each_slot in six.moves.zip(converter,
each_sample):
each_converter.feed(each_slot)
ret_dict = {}
for each_name, each_converter in six.zip(self.feed_names, converter):
for each_name, each_converter in six.moves.zip(self.feed_names,
converter):
ret_dict[each_name] = each_converter.done()
return ret_dict
......@@ -212,12 +214,14 @@ class DataFeeder(object):
if isinstance(self.place, core.CUDAPlace):
places = [
core.CUDAPlace(i)
for i in six.xrange(self._get_number_of_places_(num_places))
for i in six.moves.xrange(
self._get_number_of_places_(num_places))
]
else:
places = [
core.CPUPlace()
for _ in six.xrange(self._get_number_of_places_(num_places))
for _ in six.moves.xrange(
self._get_number_of_places_(num_places))
]
if len(iterable) != len(places):
......@@ -227,7 +231,7 @@ class DataFeeder(object):
"must be same.")
place = self.place
for p, batch in six.zip(places, iterable):
for p, batch in six.moves.zip(places, iterable):
self.place = p
yield self.feed(batch)
self.place = place
......
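The import change above (import six plus from six.moves import zip, range, xrange) is what lets the feeder call six.moves.range and six.moves.zip explicitly; both resolve to the lazy Python 2 builtins or the Python 3 ones. Sketch:

```python
import six

lod = []
for _ in six.moves.range(2):             # xrange on py2, range on py3
    lod.append([])

pairs = list(six.moves.zip(['x', 'y'], [1, 2]))  # izip on py2, zip on py3
assert pairs == [('x', 1), ('y', 2)]
```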
......@@ -14,8 +14,8 @@
import sys
import re
from graphviz import GraphPreviewGenerator
import proto.framework_pb2 as framework_pb2
from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2
from google.protobuf import text_format
_vartype2str_ = [
......
......@@ -15,11 +15,11 @@
import warnings
import numpy as np
import layers
from framework import Program, Variable, program_guard
import unique_name
from layer_helper import LayerHelper
from initializer import Constant
from . import layers
from .framework import Program, Variable, program_guard
from . import unique_name
from .layer_helper import LayerHelper
from .initializer import Constant
__all__ = [
'ChunkEvaluator',
......
......@@ -14,7 +14,8 @@
import numpy as np
import contextlib
from framework import Program, default_main_program, Variable
import six
from .framework import Program, default_main_program, Variable
from . import core
__all__ = [
......@@ -204,19 +205,19 @@ def fetch_var(name, scope=None, return_numpy=True):
def _get_program_cache_key(feed, fetch_list):
feed_var_names = feed.keys()
feed_var_names = list(feed.keys())
def to_name_str(var):
if isinstance(var, Variable):
return var.desc.name()
elif isinstance(var, str):
return var
elif isinstance(var, basestring):
elif isinstance(var, six.string_types):
return str(var)
else:
raise TypeError(str(var) + " should be Variable or str")
fetch_var_names = map(to_name_str, fetch_list)
fetch_var_names = list(map(to_name_str, fetch_list))
return str(feed_var_names + fetch_var_names)
......@@ -229,8 +230,8 @@ class Executor(object):
to feed map and fetch_list. The feed map provides input data for the program. fetch_list provides
the variables (or names) that the user wants to fetch after the program runs. Note: the executor will run all
operators in the program, not only the operators that the fetch_list depends on.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward pass finishes.
It stores the global variables into the global scope, and creates a local scope for the temporary
variables. The local scope contents will be discarded after every minibatch forward/backward pass finishes.
But the global scope variables will be persistent through different runs.
All of the ops in the program will run in sequence.
......@@ -345,7 +346,7 @@ class Executor(object):
def _fetch_data(self, fetch_list, fetch_var_name, scope):
outs = [
core.get_fetch_variable(scope, fetch_var_name, i)
for i in xrange(len(fetch_list))
for i in range(len(fetch_list))
]
return outs
......
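The cache-key hunk above works around two Python 3 behavior changes: dict.keys() returns a view and map() returns an iterator, so both are materialized before concatenation. Sketch with a hypothetical feed map:

```python
feed = {'image': None, 'label': None}   # hypothetical feed map
fetch_list = ['loss']

feed_var_names = list(feed.keys())                  # view -> list on py3
fetch_var_names = list(map(str, fetch_list))        # iterator -> list on py3
cache_key = str(feed_var_names + fetch_var_names)   # list + list now works
```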
......@@ -15,21 +15,22 @@
import collections
import contextlib
import re
import six
import numpy as np
import proto.framework_pb2 as framework_pb2
from .proto import framework_pb2
try:
from . import core
except ImportError, e:
except ImportError as e:
raise ImportError(
"""NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
if you encounter \"libmkldnn.so not found\" errors. If you have Python
installed in another directory, replace \"/usr/local/lib\" with your own
directory. The original error is: \n""" + str(e))
except Exception, e:
except Exception as e:
raise e
import unique_name
from . import unique_name
__all__ = [
'Program',
......@@ -86,7 +87,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
elif dtype == np.uint8:
return core.VarDesc.VarType.UINT8
else:
raise ValueError("Not supported numpy dtype " + str(dtype))
raise ValueError("Not supported numpy dtype " + six.binary_type(dtype))
def dtype_is_floating(dtype):
......@@ -129,15 +130,15 @@ def _debug_string_(proto, throw_on_error=True):
class Variable(object):
"""
In Fluid, every input and output of an operator is a variable. In most
cases, variables are used for holding different kinds of data or training
labels. A variable belongs to a block. Every variable has its own name, and
In Fluid, every input and output of an operator is a variable. In most
cases, variables are used for holding different kinds of data or training
labels. A variable belongs to a block. Every variable has its own name, and
two variables in different blocks could have the same name.
There are many kinds of variables. Each kind has its own attributes
and usages. Please refer to framework.proto for details.
There are many kinds of variables. Each kind has its own attributes
and usages. Please refer to framework.proto for details.
Most of a Variable's member variables can be set to None, which means
Most of a Variable's member variables can be set to None, which means
it is not available or will be specified later.
Args:
......@@ -197,6 +198,7 @@ class Variable(object):
if name is None:
name = unique_name.generate('_generated_var')
is_new_var = False
name = name if isinstance(name, six.binary_type) else name.encode()
self.desc = self.block.desc.find_var(name)
if self.desc is None:
......@@ -290,13 +292,13 @@ class Variable(object):
assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool)
protostr = self.desc.serialize_to_string()
proto = framework_pb2.VarDesc.FromString(str(protostr))
proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
res_str = _debug_string_(proto, throw_on_error)
if with_details:
additional_attr = ("error_clip", "stop_gradient")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
str(getattr(self, attr_name)))
res_str += "%s: %s\n" % (
attr_name, six.binary_type(getattr(self, attr_name)))
return res_str
__repr__ = __str__
......@@ -369,7 +371,7 @@ def get_all_op_protos():
protostrs = core.get_all_op_protos()
ret_values = []
for pbstr in protostrs:
op_proto = framework_pb2.OpProto.FromString(str(pbstr))
op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
ret_values.append(op_proto)
return ret_values
......@@ -472,7 +474,6 @@ class Operator(object):
inputs=None,
outputs=None,
attrs=None):
self.block = block
self.desc = desc
self.attrs = attrs
......@@ -523,10 +524,19 @@ class Operator(object):
% (in_proto.name, len(in_args)))
in_arg_names = []
for arg in in_args:
if isinstance(arg, basestring):
if isinstance(arg, six.string_types):
in_arg_names.append(arg)
elif isinstance(arg, six.binary_type):
in_arg_names.append(arg.decode())
else:
in_arg_names.append(arg.name)
if isinstance(arg.name, six.string_types):
in_arg_names.append(arg.name)
elif isinstance(arg.name, six.binary_type):
in_arg_names.append(arg.name.decode())
else:
raise TypeError(
"arguments require unicode, str or bytes, but get %s instead."
% (type(arg.name)))
self.desc.set_input(in_proto.name, in_arg_names)
else:
self.desc.set_input(in_proto.name, [])
......@@ -541,8 +551,9 @@ class Operator(object):
if not given == need:
raise ValueError(("Incorrect setting for output(s) of "
"operator \"%s\". Need: [%s] Given: [%s]") %
(type, ", ".join(str(e) for e in need),
", ".join(str(e) for e in given)))
(type,
", ".join(six.binary_type(e) for e in need),
", ".join(six.binary_type(e) for e in given)))
for out_proto in proto.outputs:
out_args = outputs[out_proto.name]
......@@ -554,7 +565,14 @@ class Operator(object):
(out_proto.name, len(out_args)))
out_arg_names = []
for arg in out_args:
out_arg_names.append(arg.name)
if isinstance(arg.name, six.string_types):
out_arg_names.append(arg.name)
elif isinstance(arg.name, six.binary_type):
out_arg_names.append(arg.name.decode())
else:
raise TypeError(
"arguments require unicode, str or bytes, but get %s instead."
% (type(arg.name)))
arg.op = self
self.desc.set_output(out_proto.name, out_arg_names)
......@@ -590,7 +608,7 @@ class Operator(object):
"""
protostr = self.desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr))
proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
return _debug_string_(proto, throw_on_error)
def __str__(self):
......@@ -845,7 +863,7 @@ class Block(object):
re_add_indent = re.compile(r"\n(.)")
res_str = "blocks {\n idx: %d\n parent_idx: %d" % (
self.idx, self.parent_idx)
for var in self.vars.itervalues():
for var in list(self.vars.values()):
res_str += "\n vars {\n %s }" % re_add_indent.sub(
r"\n \1", var.to_string(throw_on_error, with_details))
for op in self.ops:
......@@ -854,7 +872,8 @@ class Block(object):
res_str += "\n}"
else:
protostr = self.desc.serialize_to_string()
proto = framework_pb2.BlockDesc.FromString(str(protostr))
proto = framework_pb2.BlockDesc.FromString(
six.binary_type(protostr))
res_str = _debug_string_(proto, throw_on_error)
return res_str
......@@ -898,10 +917,11 @@ class Block(object):
Returns:
Variable: the Variable with the giving name.
"""
if not isinstance(name, basestring):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
if not isinstance(name, six.string_types):
if not isinstance(name, six.binary_type):
raise TypeError(
"var require string as parameter, but get %s instead." %
(type(name)))
v = self.vars.get(name, None)
if v is None:
raise ValueError("var %s not in this block" % name)
......@@ -949,10 +969,10 @@ class Block(object):
raise ValueError("Var {0} is not found recursively".format(name))
def all_parameters(self):
return list(self._iter_parameters())
return list(self.iter_parameters())
def _iter_parameters(self):
return (item[1] for item in self.vars.iteritems()
def iter_parameters(self):
return (item[1] for item in list(self.vars.items())
if isinstance(item[1], Parameter))
def create_var(self, *args, **kwargs):
......@@ -1132,7 +1152,7 @@ class Block(object):
self.create_var(name=var.name(), desc=var, type=var.type())
# sync variables removed from c++ end
for var in self.vars.keys():
for var in list(self.vars.keys()):
if not self.desc.find_var(var):
self.vars.pop(var)
......@@ -1204,7 +1224,7 @@ class Block(object):
if not isinstance(other, Block):
raise TypeError(
"_copy_param_info_from should be invoked with Block")
for p in other._iter_parameters():
for p in other.iter_parameters():
assert isinstance(p, Parameter)
v = self.vars.get(p.name, None)
if v is None:
......@@ -1403,7 +1423,8 @@ class Program(object):
res_str += block.to_string(throw_on_error, with_details)
else:
protostr = self.desc.serialize_to_string()
proto = framework_pb2.ProgramDesc.FromString(str(protostr))
proto = framework_pb2.ProgramDesc.FromString(
six.binary_type(protostr))
res_str = _debug_string_(proto, throw_on_error)
return res_str
......@@ -1501,7 +1522,7 @@ class Program(object):
else:
p = Program()
p.desc = core.ProgramDesc(self.desc)
p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
p._sync_with_cpp()
p._copy_param_info_from(self)
......@@ -1553,7 +1574,7 @@ class Program(object):
targets_idx.append([t.block.idx, t.idx])
res = Program()
res.desc = core.prune(self.desc, targets_idx)
res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
res._sync_with_cpp()
return res
......@@ -1564,7 +1585,7 @@ class Program(object):
2. Remove the :code:`read_op` if it exists.
3. change the :code:`is_test`
3. change the :code:`is_test`
attribute of operators to :code:`True`. All the :code:`Parameter`
information will be lost.
......@@ -1594,13 +1615,13 @@ class Program(object):
root_block._remove_var(var.name())
# change all `is_test` attributes to True
for i in xrange(res.desc.num_blocks()):
for i in range(res.desc.num_blocks()):
block = res.desc.block(i)
for j in xrange(block.op_size()):
for j in range(block.op_size()):
op = block.op(j)
if op.has_attr('is_test'):
op.set_attr('is_test', True)
res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
res._sync_with_cpp()
return res
......@@ -1613,14 +1634,14 @@ class Program(object):
and deserialization.
Args:
binary_str(str): The binary protobuf string.
binary_str_type(str): The binary protobuf string.
Returns:
Program: A deserialized program desc.
"""
p = Program()
p.desc = core.ProgramDesc(binary_str)
p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
p._sync_with_cpp()
return p
......@@ -1648,7 +1669,7 @@ class Program(object):
self._seed = seed
def __repr__(self):
return str(self)
return self.__str__()
def global_block(self):
"""
......@@ -1759,7 +1780,7 @@ class Program(object):
if len(self.blocks) != len(other.blocks):
raise ValueError("_copy_param_info_from should be invoked with two "
"program, with represent the same topology")
for var in other.global_block().vars.itervalues():
for var in list(other.global_block().vars.values()):
if var.is_data:
self.global_block().var(var.name).is_data = True
......@@ -1771,15 +1792,15 @@ class Program(object):
iterable: The generator will yield every variable in this program.
"""
for each_block in self.blocks:
for each_var in each_block.vars.itervalues():
for each_var in list(each_block.vars.values()):
yield each_var
class Parameter(Variable):
"""
Parameter is derived from Variable. A parameter is a persistable
Parameter is derived from Variable. A parameter is a persistable
Variable, and will be updated by optimizers after each iteration.
The training of a neural network is essentially the updating of
The training of a neural network is essentially the updating of
its parameters.
Relative to a general Variable, a Parameter has several of its own
......@@ -1845,8 +1866,8 @@ class Parameter(Variable):
additional_attr = ("trainable", "optimize_attr", "regularizer",
"gradient_clip_attr", "do_model_average")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
str(getattr(self, attr_name)))
res_str += "%s: %s\n" % (
attr_name, six.binary_type(getattr(self, attr_name)))
else:
res_str = Variable.to_string(self, throw_on_error, False)
return res_str
......
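A distilled sketch of the argument-name normalization added to Operator above; the helper name is hypothetical, and the error message mirrors the one in the hunk:

```python
import six

def normalize_arg_name(name):
    if isinstance(name, six.string_types):
        return name
    elif isinstance(name, six.binary_type):
        return name.decode()
    else:
        raise TypeError(
            "arguments require unicode, str or bytes, but get %s instead." %
            type(name))

assert normalize_arg_name(b"fc_0.w_0") == "fc_0.w_0"
assert normalize_arg_name("fc_0.b_0") == "fc_0.b_0"
```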
......@@ -14,12 +14,13 @@
import os
import random
import six
import subprocess
import logging
def crepr(v):
if type(v) is str or type(v) is unicode:
if isinstance(v, six.string_types):
return '"%s"' % v
return str(v)
......@@ -104,7 +105,7 @@ class Graph(object):
def _rank_repr(self):
ranks = sorted(
self.rank_groups.items(),
list(self.rank_groups.items()),
key=lambda x: x[1].priority)  # cmp= was removed in Python 3
repr = []
for x in ranks:
......@@ -148,7 +149,7 @@ class Node(object):
name=self.name,
label=self.label,
extra=',' + ','.join("%s=%s" % (key, crepr(value))
for key, value in self.attrs.items())
for key, value in list(self.attrs.items()))
if self.attrs else "")
return reprs
......@@ -172,7 +173,7 @@ class Edge(object):
target=self.target.name,
extra="" if not self.attrs else
"[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
for attr in self.attrs.items()) + "]")
for attr in list(self.attrs.items())) + "]")
return repr
......
......@@ -14,14 +14,14 @@
import contextlib
import core
import executor
import framework
import io
import parallel_executor
import unique_name
from trainer import check_and_get_place
from . import core
from . import executor
from . import framework
from . import io
from . import parallel_executor
from . import unique_name
from .trainer import check_and_get_place
__all__ = ['Inferencer', ]
......
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import framework
from . import framework
import numpy as np
import contextlib
from framework import convert_np_dtype_to_dtype_
from core import VarDesc
from .framework import convert_np_dtype_to_dtype_
from .core import VarDesc
__all__ = [
'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
......
......@@ -16,6 +16,7 @@ import os
import errno
import time
import shutil
import six
from paddle.fluid.evaluator import Evaluator
from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
......@@ -92,34 +93,34 @@ def save_vars(executor,
"""
Save variables to the given directory by executor.
There are two ways to specify variables to be saved: The first way, list
variables in a list and assign it to the `vars`. The second way, assign the
`main_program` with an existing program, then all variables in the program
will be saved. The first way has a higher priority. In other words, if `vars`
There are two ways to specify variables to be saved: The first way, list
variables in a list and assign it to the `vars`. The second way, assign the
`main_program` with an existing program, then all variables in the program
will be saved. The first way has a higher priority. In other words, if `vars`
are assigned, the `main_program` and the `predicate` will be ignored.
The `dirname` is used to specify the folder where the variables will be saved.
If you prefer to save variables in separate files in the folder `dirname`,
set `filename` None; if you prefer to save all variables in a single file,
The `dirname` is used to specify the folder where the variables will be saved.
If you prefer to save variables in separate files in the folder `dirname`,
set `filename` None; if you prefer to save all variables in a single file,
use `filename` to specify it.
Args:
executor(Executor): The executor to run for saving variables.
dirname(str): The directory path.
main_program(Program|None): The program whose variables will be saved.
If it is None, the default main program will
main_program(Program|None): The program whose variables will be saved.
If it is None, the default main program will
be used automatically.
Default: None
vars(list[Variable]|None): The list that contains all variables to save.
vars(list[Variable]|None): The list that contains all variables to save.
It has a higher priority than the `main_program`.
Default: None
predicate(function|None): If it is not None, only variables in the
`main_program` that make predicate(variable)==True
will be saved. It only works when we are using the
`main_program` to specify variables (In other words
predicate(function|None): If it is not None, only variables in the
`main_program` that make predicate(variable)==True
will be saved. It only works when we are using the
`main_program` to specify variables (In other words
`vars` is None).
Default: None
filename(str|None): The file which to save all variables. If you prefer to save
filename(str|None): The file which to save all variables. If you prefer to save
variables separately, set it to None.
Default: None
......@@ -149,7 +150,7 @@ def save_vars(executor,
# The second usage: using `vars` to specify variables
var_list = [var_a, var_b, var_c]
fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
fluid.io.save_vars(executor=exe, dirname=path, vars=var_list,
filename="vars_file")
# var_a, var_b and var_c will be saved. And they are going to be
# saved in the same file named 'var_file' in the path "./my_paddle_model".
......@@ -163,7 +164,7 @@ def save_vars(executor,
save_vars(
executor,
dirname=dirname,
vars=filter(predicate, main_program.list_vars()),
vars=list(filter(predicate, main_program.list_vars())),
filename=filename)
else:
save_program = Program()
......@@ -203,14 +204,14 @@ def save_params(executor, dirname, main_program=None, filename=None):
This function filters out all parameters from the given `main_program`
and then saves them to the folder `dirname` or the file `filename`.
Use the `dirname` to specify the saving folder. If you would like to
save parameters in separate files, set `filename` None; if you would
like to save all parameters in a single file, use `filename` to specify
Use the `dirname` to specify the saving folder. If you would like to
save parameters in separate files, set `filename` None; if you would
like to save all parameters in a single file, use `filename` to specify
the file name.
NOTICE: Some variables are not Parameter while they are necessary for
training. So you can NOT save and continue your training just by
`save_params()` and `load_params()`. Please use `save_persistables()`
NOTICE: Some variables are not Parameter while they are necessary for
training. So you can NOT save and continue your training just by
`save_params()` and `load_params()`. Please use `save_persistables()`
and `load_persistables()` instead.
Args:
......@@ -220,8 +221,8 @@ def save_params(executor, dirname, main_program=None, filename=None):
saved. If it is None, the default
main program will be used automatically.
Default: None
filename(str|None): The file to save all parameters. If you prefer
to save parameters in different files, set it
to None.
Default: None
......@@ -234,7 +235,7 @@ def save_params(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_params(executor=exe, dirname=param_path,
main_program=None)
"""
save_vars(
......@@ -248,23 +249,23 @@ def save_params(executor, dirname, main_program=None, filename=None):
def save_persistables(executor, dirname, main_program=None, filename=None):
"""
This function filters out all variables with `persistable==True` from the
given `main_program` and then saves these variables to the folder `dirname`
or file `filename`.
The `dirname` is used to specify the folder where persistable variables
are going to be saved. If you would like to save variables in separate
files, set `filename` to None; if you would like to save all variables in a
single file, use `filename` to specify the file name.
Args:
executor(Executor): The executor to run for saving persistable variables.
dirname(str): The directory path.
main_program(Program|None): The program whose persistable variables will
be saved. If it is None, the default main
program will be used automatically.
Default: None
filename(str|None): The file to save all variables. If you prefer to
save variables in different files, set it to None.
Default: None
......@@ -277,7 +278,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.save_persistables(executor=exe, dirname=param_path,
main_program=None)
"""
save_vars(
......@@ -298,34 +299,34 @@ def load_vars(executor,
"""
Load variables from the given directory by executor.
There are two ways to specify variables to be loaded: The first way, list
variables in a list and assign it to `vars`. The second way, assign an
existing program to `main_program`, then all variables in the program
will be loaded. The first way has a higher priority. In other words if `vars`
are assigned, the `main_program` and the `predicate` will be ignored.
The `dirname` is used to specify the folder where to load variables.
If variables were saved in separate files in the folder `dirname`,
set `filename` to None; if all variables were saved in a single file,
use `filename` to specify it.
Args:
executor(Executor): The executor to run for loading variables.
dirname(str): The directory path.
main_program(Program|None): The program whose variables will be loaded.
If it is None, the default main program will
be used automatically.
Default: None
vars(list[Variable]|None): The list that contains all variables to load.
It has a higher priority than the `main_program`.
Default: None
predicate(function|None): If it is not None, only variables in the
`main_program` that make predicate(variable)==True
will be loaded. It only works when we are using the
`main_program` to specify variables (In other words
`vars` is None).
Default: None
filename(str|None): The file which saved all required variables. If variables
were saved in different files, set it to None.
Default: None
......@@ -355,9 +356,9 @@ def load_vars(executor,
# The second usage: using `vars` to specify variables
var_list = [var_a, var_b, var_c]
fluid.io.load_vars(executor=exe, dirname=path, vars=var_list,
filename="vars_file")
# var_a, var_b and var_c will be loaded. And they are supposed to have
# been saved in the same file named 'vars_file' in the path "./my_paddle_model".
"""
if vars is None:
......@@ -369,7 +370,7 @@ def load_vars(executor,
load_vars(
executor,
dirname=dirname,
vars=filter(predicate, main_program.list_vars()),
vars=list(filter(predicate, main_program.list_vars())),
filename=filename)
else:
load_prog = Program()
......@@ -410,15 +411,15 @@ def load_params(executor, dirname, main_program=None, filename=None):
and then tries to load these parameters from the folder `dirname` or
the file `filename`.
Use the `dirname` to specify the folder where parameters were saved. If
parameters were saved in separate files in the folder `dirname`, set
`filename` to None; if all parameters were saved in a single file, use
`filename` to specify the file name.
NOTICE: Some variables are not Parameters although they are necessary for
training. So you can NOT save and continue your training just by
`save_params()` and `load_params()`. Please use `save_persistables()`
and `load_persistables()` instead.
Args:
executor(Executor): The executor to run for loading parameters.
......@@ -427,7 +428,7 @@ def load_params(executor, dirname, main_program=None, filename=None):
loaded. If it is None, the default
main program will be used automatically.
Default: None
filename(str|None): The file which saved all parameters. If parameters
were saved in different files, set it to None.
Default: None
......@@ -440,7 +441,7 @@ def load_params(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_params(executor=exe, dirname=param_path,
main_program=None)
"""
load_vars(
......@@ -453,23 +454,23 @@ def load_params(executor, dirname, main_program=None, filename=None):
def load_persistables(executor, dirname, main_program=None, filename=None):
"""
This function filters out all variables with `persistable==True` from the
given `main_program` and then tries to load these variables from the folder
`dirname` or the file `filename`.
Use the `dirname` to specify the folder where persistable variables were
saved. If variables were saved in separate files, set `filename` to None;
if all variables were saved in a single file, use `filename` to specify
the file name.
Args:
executor(Executor): The executor to run for loading persistable variables.
dirname(str): The directory path.
main_program(Program|None): The program whose persistable variables will
be loaded. If it is None, the default main
program will be used automatically.
Default: None
filename(str|None): The file which saved all variables. If variables were
saved in different files, set it to None.
Default: None
......@@ -482,7 +483,7 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
exe = fluid.Executor(fluid.CPUPlace())
param_path = "./my_paddle_model"
prog = fluid.default_main_program()
fluid.io.load_persistables(executor=exe, dirname=param_path,
main_program=None)
"""
load_vars(
......@@ -561,20 +562,20 @@ def save_inference_model(dirname,
Args:
dirname(str): The directory path to save the inference model.
feeded_var_names(list[str]): Names of variables that need to be fed data
during inference.
target_vars(list[Variable]): Variables from which we can get inference
results.
executor(Executor): The executor that saves the inference model.
main_program(Program|None): The original program, which will be pruned to
build the inference model. If it is set to None,
the default main program will be used.
Default: None.
model_filename(str|None): The name of file to save the inference program
itself. If it is set to None, a default filename
`__model__` will be used.
params_filename(str|None): The name of file to save all related parameters.
If it is set to None, parameters will be saved
in separate files.
Returns:
......@@ -592,20 +593,32 @@ def save_inference_model(dirname,
fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
target_vars=[predict_var], executor=exe)
# In this example, the function will prune the default main program
# to make it suitable for inferring the `predict_var`. The pruned
# inference program is going to be saved in the "./infer_model/__model__"
# and parameters are going to be saved in separate files under folder
# "./infer_model".
"""
if isinstance(feeded_var_names, basestring):
if isinstance(feeded_var_names, six.binary_type):
feeded_var_names = [feeded_var_names]
elif isinstance(feeded_var_names, six.text_type):
feeded_var_names = [feeded_var_names.encode()]
else:
if len(feeded_var_names) > 0:
# TODO(paddle-dev): polish these code blocks
if not (bool(feeded_var_names) and all(
isinstance(name, basestring) for name in feeded_var_names)):
raise ValueError("'feed_var_names' should be a list of str.")
isinstance(name, six.binary_type)
for name in feeded_var_names)):
if not (all(
isinstance(name, six.text_type)
for name in feeded_var_names)):
raise ValueError(
"'feed_var_names' should be a list of str.")
else:
feeded_var_names = [
name.encode() for name in feeded_var_names
]
if isinstance(target_vars, Variable):
target_vars = [target_vars]
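The rewritten type checks above replace Python 2's `basestring` with explicit `six.binary_type`/`six.text_type` tests and normalize every name to bytes. A rough standalone sketch of the same idea, using a hypothetical `normalize_names` helper:

    import six

    def normalize_names(names):
        # Accept one name or a list of names; always return a list of bytes.
        if isinstance(names, six.binary_type):
            return [names]
        if isinstance(names, six.text_type):
            return [names.encode()]
        if all(isinstance(n, six.binary_type) for n in names):
            return list(names)
        if all(isinstance(n, six.text_type) for n in names):
            return [n.encode() for n in names]
        raise ValueError("names should be a str or a list of str")

    assert normalize_names(u"img") == [b"img"]
    assert normalize_names([u"img", u"label"]) == [b"img", b"label"]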
......@@ -662,22 +675,22 @@ def load_inference_model(dirname,
dirname(str): The directory path
executor(Executor): The executor to run for loading inference model.
model_filename(str|None): The name of file to load inference program.
If it is None, the default filename
'__model__' will be used.
Default: None
params_filename(str|None): The name of file to load all parameters.
It is only used for the case that all
parameters were saved in a single binary
file. If parameters were saved in separate
files, set it as 'None'.
Returns:
tuple: The return of this function is a tuple with three elements:
(program, feed_target_names, fetch_targets). The `program` is a
Program, it's the program for inference. The `feed_target_names` is
a list of str, it contains names of variables that need to be fed
data in the inference program. The `fetch_targets` is a list of
Variable. It contains variables from which we can get inference
results.
Raises:
......@@ -688,17 +701,17 @@ def load_inference_model(dirname,
exe = fluid.Executor(fluid.CPUPlace())
path = "./infer_model"
[inference_program, feed_target_names, fetch_targets] =
fluid.io.load_inference_model(dirname=path, executor=exe)
results = exe.run(inference_program,
feed={feed_target_names[0]: tensor_img},
fetch_list=fetch_targets)
# In this example, the inference program was saved in the
# "./infer_model/__model__" and parameters were saved in
# separate files in "./infer_model".
# After getting inference program, feed target names and
# fetch targets, we can use an Executor to run the inference
# program to get the inference result.
"""
......
......@@ -14,12 +14,14 @@
import copy
import itertools
import six
from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
import unique_name
from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
from . import unique_name
from paddle.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr, WeightNormParamAttr
import core
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core
from six.moves import zip
class LayerHelper(object):
......@@ -83,7 +85,7 @@ class LayerHelper(object):
raise ValueError("parameter number mismatch")
elif len(param_attr) == 1 and length != 1:
tmp = [None] * length
for i in xrange(length):
for i in range(length):
tmp[i] = copy.deepcopy(param_attr[0])
param_attr = tmp
return param_attr
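`xrange` is gone in Python 3; `range` takes over its role and is itself lazy, so plain loops only need the rename, while code that mutates the result must add `list()`. A small sketch covering both patterns seen in these hunks:

    # Iteration: range() works directly, like Python 2's xrange().
    tmp = [None] * 3
    for i in range(3):
        tmp[i] = {"index": i}

    # Item assignment: materialize first, since a range object is immutable.
    perm = list(range(4))
    perm[0], perm[2] = perm[2], perm[0]
    assert perm == [2, 1, 0, 3]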
......@@ -91,7 +93,7 @@ class LayerHelper(object):
def iter_inputs_and_params(self, input_param_name='input'):
inputs = self.multiple_input(input_param_name)
param_attrs = self.multiple_param_attr(len(inputs))
for ipt, param_attr in itertools.izip(inputs, param_attrs):
for ipt, param_attr in zip(inputs, param_attrs):
yield ipt, param_attr
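`itertools.izip` was removed in Python 3 because the built-in `zip` became lazy; importing `zip` from `six.moves` picks the lazy variant on both interpreters. Sketch:

    from six.moves import zip  # izip on Python 2, the built-in zip on Python 3

    inputs = ["x1", "x2"]
    param_attrs = [{"lr": 1.0}, {"lr": 2.0}]
    for ipt, attr in zip(inputs, param_attrs):
        print(ipt, attr["lr"])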
def input_dtype(self, input_param_name='input'):
......@@ -218,7 +220,7 @@ class LayerHelper(object):
norm = __norm_op(reshape, dim=0, block=block)
__reshape_op(norm, out=out, shape=out_shape, block=block)
else:
perm = range(len(x.shape))
perm = list(range(len(x.shape)))
perm[0], perm[dim] = dim, 0
transpose = __transpose_op(x, perm, block=block)
norm = __norm_op(transpose, dim=0, block=block)
......@@ -397,8 +399,10 @@ class LayerHelper(object):
act = self.kwargs.get('act', None)
if act is None:
return input_var
if isinstance(act, basestring):
if isinstance(act, six.string_types):
act = {'type': act}
else:
raise TypeError(str(act) + " should be unicode or str")
if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
act['use_cudnn'] = self.kwargs.get('use_cudnn')
......
......@@ -12,25 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import ops
from ops import *
import nn
from nn import *
import io
from io import *
import tensor
from tensor import *
import control_flow
from control_flow import *
import device
from device import *
import math_op_patch
from math_op_patch import *
import detection
from detection import *
import metric_op
from metric_op import *
from learning_rate_scheduler import *
from . import ops
from .ops import *
from . import nn
from .nn import *
from . import io
from .io import *
from . import tensor
from .tensor import *
from . import control_flow
from .control_flow import *
from . import device
from .device import *
from . import math_op_patch
from .math_op_patch import *
from . import detection
from .detection import *
from . import metric_op
from .metric_op import *
from .learning_rate_scheduler import *
__all__ = []
__all__ += nn.__all__
......
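The wholesale rewrite of this `__init__.py` is the PEP 328 fix: Python 3 dropped implicit relative imports, so a bare `import ops` would now search for a top-level `ops` module instead of the sibling file. Shown as a sketch of a hypothetical package, since relative imports only work inside one:

    # fluid/layers/__init__.py (hypothetical minimal layout)
    # Python 2 only (implicit relative import):
    #     import ops
    #     from ops import *
    # Python 2 and 3 (explicit relative import):
    from . import ops    # the sibling module fluid/layers/ops.py
    from .ops import *   # re-export its public names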
......@@ -13,15 +13,16 @@
# limitations under the License.
import contextlib
from layer_function_generator import autodoc, templatedoc
from tensor import assign, fill_constant
from .layer_function_generator import autodoc, templatedoc
from .tensor import assign, fill_constant
from .. import core
from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name
from ..initializer import force_init_on_cpu
from ops import logical_and, logical_not, logical_or
from .ops import logical_and, logical_not, logical_or
import numpy
import warnings
from functools import reduce
__all__ = [
'While',
......@@ -276,7 +277,7 @@ class ParallelDo(object):
avg_cost = fluid.layers.mean(x=cost)
.. warning::
It will soon be deprecated, please use ParallelExecutor instead.
"""
......@@ -601,7 +602,7 @@ class StaticRNN(object):
boot_memories = []
pre_memories = []
memories = []
for _, mem in self.memories.iteritems():
for _, mem in list(self.memories.items()):
boot_memories.append(mem.init)
pre_memories.append(mem.pre_mem.name)
mem_var = rnn_block.var(mem.mem.name)
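`dict.iteritems()` does not exist in Python 3, where `items()` already returns a lazy view; 2to3 wraps the call in `list()` so iteration also survives mutation of the dict. Sketch:

    memories = {"h": 0, "c": 1}
    for name, idx in list(memories.items()):  # snapshot of the items
        memories[name + "_pre"] = idx         # safe: the dict may grow now
    assert len(memories) == 4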
......@@ -819,21 +820,21 @@ def max_sequence_len(rank_table):
def lod_tensor_to_array(x, table):
"""
"""
Convert a LoDTensor to a LoDTensorArray.
This function splits a LoDTensor into a LoDTensorArray according to its LoD
information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in
PaddlePaddle. The generated LoDTensorArray of this function can be further read
or written by `read_from_array()` and `write_to_array()` operators. However,
this function is generally an internal component of PaddlePaddle `DynamicRNN`.
Users should not use it directly.
Args:
x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
table (ParamAttr|list): The variable that stores the level of lod
which is ordered by sequence length in
descending order. It is generally generated
by `layers.lod_rank_table()` API.
Returns:
......@@ -1067,9 +1068,9 @@ def array_read(array, i):
Given:
array = [0.6, 0.1, 0.3, 0.1]
And:
i = 2
Then:
......@@ -1176,9 +1177,9 @@ def array_length(array):
class ConditionalBlockGuard(BlockGuard):
"""
ConditionalBlockGuard is derived from BlockGuard. It is dedicated to
holding a ConditionalBlock, and helps users enter and exit the
ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard
is generally an internal component of IfElse, users should not use it directly.
"""
......@@ -1512,7 +1513,7 @@ class IfElse(object):
def __call__(self):
if self.status != self.OUT_IF_ELSE_BLOCKS:
raise ValueError("IfElse::__call__ must be out of sub-block")
false_len, true_len = map(len, self.output_table)
false_len, true_len = list(map(len, self.output_table))
if false_len == 0 and true_len == 0:
raise ValueError("Must invoke true_block/false_block before "
"__call__")
......@@ -1932,7 +1933,7 @@ def is_empty(x, cond=None, **ignored):
Args:
x (Variable): The Variable to be tested.
cond (Variable|None): Output parameter. Returns the test result
of given 'x'. Default: None
Returns:
......
......@@ -15,12 +15,13 @@
All layers just related to the detection neural network.
"""
from layer_function_generator import generate_layer_fn
from layer_function_generator import autodoc, templatedoc
from .layer_function_generator import generate_layer_fn
from .layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper
import tensor
import nn
from . import tensor
from . import nn
import math
from functools import reduce
__all__ = [
'prior_box',
......@@ -1032,7 +1033,7 @@ def multi_box_head(inputs,
min_sizes = []
max_sizes = []
step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
for ratio in xrange(min_ratio, max_ratio + 1, step):
for ratio in range(min_ratio, max_ratio + 1, step):
min_sizes.append(base_size * ratio / 100.)
max_sizes.append(base_size * (ratio + step) / 100.)
min_sizes = [base_size * .10] + min_sizes
......
......@@ -15,7 +15,7 @@
All util layers.
"""
from layer_function_generator import autodoc
from .layer_function_generator import autodoc
from ..framework import unique_name
from ..layer_helper import LayerHelper
from ..annotations import deprecated
......
......@@ -16,8 +16,8 @@ import multiprocessing
import threading
from ..data_feeder import DataFeeder
from control_flow import BlockGuard
from layer_function_generator import templatedoc
from .control_flow import BlockGuard
from .layer_function_generator import templatedoc
from .. import core
from ..executor import global_scope
from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
......@@ -69,7 +69,7 @@ def data(name,
"""
helper = LayerHelper('data', **locals())
shape = list(shape)
for i in xrange(len(shape)):
for i in range(len(shape)):
if shape[i] is None:
shape[i] = -1
append_batch_size = False
......@@ -387,9 +387,9 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
Create a uniform random data generator
This layer returns a Reader Variable.
Instead of opening a file and reading data from it, this
Reader Variable generates float uniform random data by itself.
It can be used as a dummy reader to test a network without
opening a real file.
Args:
......@@ -707,9 +707,9 @@ def open_files(filenames,
"""
Open files
This layer takes a list of files to read from and returns a Reader Variable.
Via the Reader Variable, we can get data from given files. All files must
have name suffixes to indicate their formats, e.g., '*.recordio'.
Args:
filenames(list): The list of file names.
......@@ -825,9 +825,9 @@ def shuffle(reader, buffer_size):
def batch(reader, batch_size):
"""
This layer is a reader decorator. It takes a reader and adds
'batching' decoration on it. When reading with the result
decorated reader, output data will be automatically organized
to the form of batches.
Args:
......@@ -852,11 +852,11 @@ def batch(reader, batch_size):
# If we read data with the raw_reader:
# data = fluid.layers.read_file(raw_reader)
# We can only get data instance by instance.
#
# However, if we read data with the batch_reader:
# data = fluid.layers.read_file(batch_reader)
# Every 5 adjacent instances will be automatically combined together
# to become a batch. So what we get ('data') is a batch of data instead
# of an instance.
"""
return __create_unshared_decorated_reader__(
......@@ -903,8 +903,8 @@ def read_file(reader):
"""
Execute the given reader and get data via it.
A reader is also a Variable. It can be a raw reader generated by
`fluid.layers.open_files()` or a decorated one generated by
`fluid.layers.double_buffer()` and so on.
Args:
......@@ -1005,7 +1005,7 @@ class Preprocessor(object):
source_lod_levels = self.underlying_reader.desc.lod_levels()
self.source_var_names = [
unique_name("preprocessor_source")
for _ in xrange(len(source_shapes))
for _ in range(len(source_shapes))
]
source_vars = []
for var_name, shape, dtype, lod_level in zip(
......
......@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import cStringIO
import functools
import warnings
import string
from six.moves import cStringIO
from ..proto import framework_pb2
from ..framework import OpProtoHolder, Variable
from ..layer_helper import LayerHelper
......@@ -70,7 +70,7 @@ def _generate_doc_string_(op_proto):
if not isinstance(op_proto, framework_pb2.OpProto):
raise TypeError("OpProto should be `framework_pb2.OpProto`")
buf = cStringIO.StringIO()
buf = cStringIO()
buf.write(escape_math(op_proto.comment))
buf.write('\nArgs:\n')
for each_input in op_proto.inputs:
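The `cStringIO` module disappeared in Python 3; `six.moves.cStringIO` aliases `cStringIO.StringIO` on Python 2 and `io.StringIO` on Python 3, which is why the code now imports a callable and invokes it directly. Sketch:

    from six.moves import cStringIO  # io.StringIO on Python 3

    buf = cStringIO()
    buf.write("Args:\n")
    buf.write("    x: the input variable\n")
    assert buf.getvalue().startswith("Args:")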
......@@ -119,9 +119,9 @@ def generate_layer_fn(op_type):
"""
op_proto = OpProtoHolder.instance().get_op_proto(op_type)
not_intermediate_outputs = \
filter(lambda output: not output.intermediate, op_proto.outputs)
[output for output in op_proto.outputs if not output.intermediate]
intermediate_outputs = \
filter(lambda output: output.intermediate, op_proto.outputs)
[output for output in op_proto.outputs if output.intermediate]
if len(not_intermediate_outputs) != 1:
raise ValueError("Only one non intermediate output operator can be",
......
......@@ -20,10 +20,10 @@ User can also implement their own learning_rate_decay
strategy according to this module.
"""
import control_flow
import nn
import ops
import tensor
from . import control_flow
from . import nn
from . import ops
from . import tensor
from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter
......
......@@ -13,7 +13,7 @@
# limitations under the License.
from ..framework import Variable, unique_name
from layer_function_generator import OpProtoHolder
from .layer_function_generator import OpProtoHolder
from ..initializer import force_init_on_cpu
......
......@@ -20,7 +20,7 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
import nn
from . import nn
__all__ = ['accuracy', 'auc']
......
......@@ -33,11 +33,12 @@ from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
from layer_function_generator import autodoc, templatedoc
from tensor import concat
import utils
from .layer_function_generator import autodoc, templatedoc
from .tensor import concat
from . import utils
import random
from .. import unique_name
from functools import reduce
__all__ = [
'fc',
......@@ -4849,7 +4850,7 @@ def dice_loss(input, label, epsilon=0.00001):
loss = fluid.layers.dice_loss(input=predictions, label=label, epsilon=0.00001)
"""
label = one_hot(label, depth=input.shape[-1])
reduce_dim = range(1, len(input.shape))
reduce_dim = list(range(1, len(input.shape)))
inse = reduce_sum(input * label, dim=reduce_dim)
dice_denominator = reduce_sum(
input, dim=reduce_dim) + reduce_sum(
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from layer_function_generator import generate_layer_fn
from .layer_function_generator import generate_layer_fn
__activations__ = [
'sigmoid',
......
......@@ -18,7 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable
from ..initializer import Constant, force_init_on_cpu
from ..core import VarDesc
from layer_function_generator import templatedoc
from .layer_function_generator import templatedoc
import numpy
__all__ = [
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import core
from . import core
import numpy as np
__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
......@@ -24,7 +24,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
Create a lod tensor by doing the following:
1. Check that the length-based level of detail (LoD), also known as
recursive_sequence_lengths of the input, is valid.
2. Convert recursive_sequence_lengths to an offset-based LoD.
......@@ -33,7 +33,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
CPU or GPU device (based on input place).
4. Set the level of detail (LoD) using the offset-based LoD.
Examples:
Suppose we want LoDTensor to hold data for sequences of words, where each
......@@ -51,7 +51,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
Args:
data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
list holding the data to be copied.
recursive_seq_lens(list): a list of lists indicating the length-based level of detail
info specified by the user.
place(Place): CPU or GPU place indicating where the data in the new
LoDTensor will be stored.
......@@ -62,10 +62,10 @@ def create_lod_tensor(data, recursive_seq_lens, place):
if isinstance(data, core.LoDTensor):
return create_lod_tensor(np.array(data), recursive_seq_lens, place)
elif isinstance(data, list):
# When input data is a list, it only deals with the case where the base element
# is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
# LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
# of words or other indexes in the sequence.
new_recursive_seq_lens = []
for seq in data:
new_recursive_seq_lens.append(len(seq))
......@@ -109,12 +109,12 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
Suppose we want LoDTensor to hold data for sequences of words, where each
word is represented by an integer. If we want to create a LoDTensor to
represent two sentences, one of 2 words, and one of 3 words. Then
'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
for two sentences.
Args:
recursive_seq_lens(list): a list of lists indicating the length-based
level of detail info specified by the user.
base_shape(list): the shape of the basic element to be held by the
LoDTensor.
......@@ -124,11 +124,11 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
high(int): the upper bound of the random integers.
Returns:
A fluid LoDTensor object with tensor data and recursive_seq_lens info.
"""
assert isinstance(base_shape, list), "base_shape should be a list"
# append the total number of basic elements to the front of its shape
overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
# the range of integer data elements is [low, high]
data = np.random.random_integers(low, high, overall_shape).astype("int64")
return create_lod_tensor(data, recursive_seq_lens, place)
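A short usage sketch for the two APIs above, assuming a working paddle.fluid build and reusing the docstring's two-sentence example (word lengths [[2, 3]]):

    import numpy as np
    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    # Two sentences of 2 and 3 words; each word is an int64 index of shape [1].
    t = fluid.create_lod_tensor([[1, 2], [3, 4, 5]], [[2, 3]], place)
    print(np.array(t).shape)               # (5, 1): 5 words in total
    print(t.recursive_sequence_lengths())  # [[2, 3]]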
......@@ -79,10 +79,10 @@ class MetricBase(object):
"""
states = {
attr: value
for attr, value in self.__dict__.iteritems()
for attr, value in list(self.__dict__.items())
if not attr.startswith("_")
}
for attr, value in states.iteritems():
for attr, value in list(states.items()):
if isinstance(value, int):
setattr(self, attr, 0)
elif isinstance(value, float):
......@@ -105,7 +105,7 @@ class MetricBase(object):
"""
states = {
attr: value
for attr, value in self.__dict__.iteritems()
for attr, value in list(self.__dict__.items())
if not attr.startswith("_")
}
config = {}
......
......@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
try:
from graphviz import Digraph
from .graphviz import Digraph
except ImportError:
logger.info(
'Cannot import graphviz, which is required for drawing a network. This '
......@@ -77,7 +77,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
# fill the known variables
for block in program.blocks:
for var in block.vars:
if not var_dict.has_key(var):
if var not in var_dict:
var_dict[var] = "Feed"
temp_id = 0
......@@ -93,17 +93,17 @@ def parse_graph(program, graph, var_dict, **kwargs):
var_dict[arg] = op.type
for e in op.inputs:
for arg in e.arguments:
if var_dict.has_key(arg):
if arg in var_dict:
graph.edge(**draw_edge(var_dict, op, e, arg))
break # only plot the first block
def draw_graph(startup_program, main_program, **kwargs):
if kwargs.has_key("graph_attr"):
if "graph_attr" in kwargs:
GRAPH_STYLE.update(kwargs["graph_attr"])
if kwargs.has_key("node_attr"):
if "node_attr" in kwargs:
OP_STYLE.update(kwargs["node_attr"])
if kwargs.has_key("edge_attr"):
if "edge_attr" in kwargs:
VAR_STYLE.update(kwargs["edge_attr"])
graph_id = unique_id()
......
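`dict.has_key()` was removed outright in Python 3; the `in` operator is the portable replacement (and was already preferred style in Python 2). Sketch:

    var_dict = {"x": "Feed"}
    if "y" not in var_dict:     # was: if not var_dict.has_key("y"):
        var_dict["y"] = "Feed"
    assert "y" in var_dict      # was: var_dict.has_key("y")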
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
from . import layers
__all__ = [
"simple_img_conv_pool",
......@@ -210,7 +210,7 @@ def img_conv_group(input,
conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
for i in xrange(len(conv_num_filter)):
for i in range(len(conv_num_filter)):
local_conv_act = conv_act
if conv_with_batchnorm[i]:
local_conv_act = None
......@@ -488,10 +488,11 @@ def scaled_dot_product_attention(queries,
trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
return layers.reshape(
x=trans_x,
shape=map(int, [
trans_x.shape[0], trans_x.shape[1],
trans_x.shape[2] * trans_x.shape[3]
]))
shape=list(
map(int, [
trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] *
trans_x.shape[3]
])))
q, k, v = __compute_qkv(queries, keys, values, num_heads)
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import paddle.fluid.core as core
import paddle.fluid.proto.framework_pb2 as framework_pb2
......@@ -24,13 +26,13 @@ def get_all_op_protos():
protostrs = core.get_all_op_protos()
ret_values = []
for pbstr in protostrs:
op_proto = framework_pb2.OpProto.FromString(str(pbstr))
op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
ret_values.append(op_proto)
return ret_values
def is_str(s):
return isinstance(s, str) or isinstance(s, unicode)
return isinstance(s, six.string_types)
class OpDescCreationMethod(object):
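Two idioms meet in this hunk: protobuf's `FromString` wants real bytes, so `str(pbstr)` becomes `six.binary_type(pbstr)` (`bytes` on Python 3, `str` on Python 2), and the `str`-or-`unicode` test collapses into `six.string_types`. Sketch of the latter:

    import six

    def is_str(s):
        # six.string_types is (str, unicode) on Python 2 and (str,) on Python 3
        return isinstance(s, six.string_types)

    assert is_str("abc") and is_str(u"abc")
    assert six.PY2 or not is_str(b"abc")  # bytes are not text on Python 3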
......@@ -189,7 +191,7 @@ class OperatorFactory(object):
return self.get_op_info(t).method(**kwargs)
def types(self):
return self.op_methods.keys()
return list(self.op_methods.keys())
def get_op_info(self, t):
if t not in self.op_methods:
......@@ -197,13 +199,13 @@ class OperatorFactory(object):
return self.op_methods.get(t)
def get_op_input_names(self, type):
return map(lambda x: x[0], self.get_op_info(type).inputs)
return [x[0] for x in self.get_op_info(type).inputs]
def get_op_inputs(self, type):
return self.get_op_info(type).inputs
def get_op_output_names(self, type):
return map(lambda x: x[0], self.get_op_info(type).outputs)
return [x[0] for x in self.get_op_info(type).outputs]
def get_op_outputs(self, type):
return self.get_op_info(type).outputs
......
......@@ -14,15 +14,15 @@
import re
from collections import defaultdict
from paddle.fluid.framework import Program, Variable
import framework
import layers
from backward import append_backward
from framework import program_guard
import unique_name
from initializer import Constant
from layer_helper import LayerHelper
from regularizer import append_regularization_ops
from clip import append_gradient_clip_ops, error_clip_callback
from . import framework
from . import layers
from .backward import append_backward
from .framework import program_guard
from . import unique_name
from .initializer import Constant
from .layer_helper import LayerHelper
from .regularizer import append_regularization_ops
from .clip import append_gradient_clip_ops, error_clip_callback
from contextlib import contextmanager
__all__ = [
......
......@@ -12,10 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import core
from __future__ import print_function
import multiprocessing
import framework
import executor
from . import core
from . import framework
from . import executor
import warnings
import sys
import os
......@@ -94,7 +95,7 @@ class ParallelExecutor(object):
self._places = []
self._act_places = []
if use_cuda:
for i in xrange(core.get_cuda_device_count()):
for i in range(core.get_cuda_device_count()):
p = core.Place()
self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1])
......@@ -102,7 +103,7 @@ class ParallelExecutor(object):
else:
cpu_num = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
for i in xrange(cpu_num):
for i in range(cpu_num):
p = core.Place()
self._act_places.append(core.CPUPlace())
p.set_place(self._act_places[-1])
......@@ -143,16 +144,16 @@ class ParallelExecutor(object):
) if share_vars_from else []
self.persistable_vars = [
v.name
for v in filter(
lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
main.list_vars())
v.name for v in [
var for var in main.list_vars()
if var.persistable and var.type != core.VarDesc.VarType.RAW
]
]
self.executor = core.ParallelExecutor(
self._places,
set([
p.name for p in main.global_block()._iter_parameters()
p.name for p in main.global_block().iter_parameters()
if not p.stop_gradient
]),
set(self.persistable_vars), main.desc, loss_name
......@@ -227,7 +228,9 @@ class ParallelExecutor(object):
"""
if feed is None and feed_dict is not None:
feed = feed_dict
print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
print(
"`feed_dict` is deprecated. Please use `feed=`",
file=sys.stderr)
if isinstance(feed, dict):
feed_tensor_dict = dict()
......
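`print >> sys.stderr, msg` is Python 2's stream-redirection syntax; with `from __future__ import print_function` (or on Python 3) the equivalent is the `file=` keyword argument, as the new code above uses. Sketch:

    from __future__ import print_function
    import sys

    # Python 2 only:  print >> sys.stderr, "`feed_dict` is deprecated."
    print("`feed_dict` is deprecated. Please use `feed=`", file=sys.stderr)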
......@@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from initializer import Initializer, Xavier, Constant
from regularizer import WeightDecayRegularizer
import six
from .initializer import Initializer, Xavier, Constant
from .regularizer import WeightDecayRegularizer
__all__ = [
'ParamAttr',
......@@ -134,7 +136,7 @@ class ParamAttr(object):
return [ParamAttr._to_attr(a) for a in arg]
elif isinstance(arg, ParamAttr):
return arg
elif isinstance(arg, str) or isinstance(arg, unicode):
elif isinstance(arg, six.string_types):
return ParamAttr(name=arg)
elif isinstance(arg, Initializer):
return ParamAttr(initializer=arg)
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import core
from . import core
from contextlib import contextmanager
import os
......@@ -224,7 +224,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
If the state == 'All', a profile proto file will be written to
`profile_path`. This file records timeline information during the execution.
Then users can visualize this file to see the timeline; please refer to
https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
Args:
......
......@@ -13,8 +13,8 @@
# limitations under the License.
import os
import core
import contextlib
from . import core
__all__ = [
'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
]
......
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import framework
from . import framework
from . import core
__all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']
......
......@@ -63,7 +63,7 @@ def train(use_cuda, train_program, params_dirname):
if event.step == 10:
test_metrics = trainer.test(
reader=test_reader, feed_order=['x', 'y'])
print test_metrics
print(test_metrics)
'''
...
['25.768919467926025']
......
......@@ -28,11 +28,12 @@ images per class.
"""
import cPickle
import itertools
import numpy
import paddle.v2.dataset.common
import tarfile
from six.moves import cPickle as pickle
from six.moves import zip
__all__ = ['train10']
......@@ -46,7 +47,7 @@ def reader_creator(filename, sub_name, batch_size=None):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
assert labels is not None
for sample, label in itertools.izip(data, labels):
for sample, label in zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
......@@ -56,7 +57,7 @@ def reader_creator(filename, sub_name, batch_size=None):
batch_count = 0
for name in names:
batch = cPickle.load(f.extractfile(name))
batch = pickle.load(f.extractfile(name))
for item in read_batch(batch):
if isinstance(batch_size, int) and batch_count > batch_size:
break
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy as np
......@@ -178,14 +176,15 @@ def train(use_cuda, train_program, params_dirname):
if float(avg_cost) < 100.0: # Large value to increase CI speed
trainer.save_params(params_dirname)
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
print(
('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost))))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
event.step, event.epoch, list(map(np.array, event.metrics))))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(params_dirname)
trainer.stop()
......@@ -207,14 +206,14 @@ def infer(use_cuda, inference_program, params_dirname):
inference_program, param_path=params_dirname, place=place)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -250,7 +250,7 @@ def decode_main(use_cuda, is_sparse):
feeder = fluid.DataFeeder(feed_list, place)
for data in train_data():
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict = feeder.feed([[x[0]] for x in data])
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
......@@ -259,7 +259,7 @@ def decode_main(use_cuda, is_sparse):
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.recursive_sequence_lengths()
print(result_ids.recursive_sequence_lengths())
break
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -89,8 +89,10 @@ def train(use_cuda, train_program, params_dirname):
if math.isnan(avg_cost):
sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(numpy.array, event.metrics)))
print(
("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch,
list(map(numpy.array, event.metrics)))))
train_reader = paddle.batch(
paddle.reader.shuffle(
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle
......
......@@ -186,8 +186,9 @@ def train(use_cuda, train_program, params_dirname):
trainer.save_params(params_dirname)
trainer.stop()
else:
print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost)))
print(
('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
float(avg_cost))))
if math.isnan(float(avg_cost)):
sys.exit("got NaN loss, training failed.")
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
......@@ -98,7 +96,7 @@ def train(use_cuda, train_program, params_dirname):
sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
event.step, event.epoch, list(map(np.array, event.metrics))))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(params_dirname)
trainer.stop()
......@@ -125,14 +123,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
......@@ -113,7 +111,7 @@ def train(use_cuda, train_program, params_dirname):
sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
event.step, event.epoch, list(map(np.array, event.metrics))))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(params_dirname)
trainer.stop()
......@@ -140,14 +138,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from functools import partial
......@@ -107,7 +105,7 @@ def train(use_cuda, train_program, params_dirname):
sys.exit("got NaN loss, training failed.")
elif isinstance(event, fluid.EndStepEvent):
print("Step {0}, Epoch {1} Metrics {2}".format(
event.step, event.epoch, map(np.array, event.metrics)))
event.step, event.epoch, list(map(np.array, event.metrics))))
if event.step == 1: # Run 2 iterations to speed CI
trainer.save_params(params_dirname)
trainer.stop()
......@@ -135,14 +133,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
place=place)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid.layers.device import get_places
import unittest
import paddle.fluid as fluid
......@@ -175,7 +175,7 @@ def train(word_dict,
def train_loop(main_program):
exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM):
for pass_id in range(PASS_NUM):
for data in train_data():
cost_val, acc_val = exe.run(main_program,
feed=feeder.feed(data),
......@@ -235,14 +235,14 @@ def infer(word_dict, use_cuda, save_dirname=None):
word_dict_len = len(word_dict)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -114,7 +114,7 @@ def infer(use_cuda, save_dirname=None):
test_reader = paddle.batch(
paddle.dataset.uci_housing.test(), batch_size=batch_size)
test_data = test_reader().next()
test_data = next(test_reader())
test_feat = numpy.array(
[data[0] for data in test_data]).astype("float32")
test_label = numpy.array(
......
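Generators lost their `.next()` method in Python 3; the built-in `next()` works on both interpreters and is what 2to3 substitutes, as in the hunk above. Sketch:

    def make_reader():
        yield [(1.0, 2.0)]
        yield [(3.0, 4.0)]

    first_batch = next(make_reader())  # was: make_reader().next()
    assert first_batch == [(1.0, 2.0)]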
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import contextlib
......@@ -121,7 +119,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=predict, label=label)
# Test program
test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
......
......@@ -181,7 +181,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
start_time = time.time()
batch_id = 0
for pass_id in xrange(PASS_NUM):
for pass_id in range(PASS_NUM):
for data in train_data():
cost = exe.run(main_program,
feed=feeder.feed(data),
......@@ -248,14 +248,14 @@ def infer(use_cuda, save_dirname=None):
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for three sentences of
# length 3, 4 and 2, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[3, 4, 2]]
base_shape = [1]
......
......@@ -199,7 +199,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0
for pass_id in xrange(1):
for pass_id in range(1):
for data in train_data():
outs = exe.run(main_program,
feed=feeder.feed(data),
......@@ -273,7 +273,7 @@ def decode_main(use_cuda, is_sparse):
feeder = fluid.DataFeeder(feed_list, place)
for data in train_data():
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict = feeder.feed([[x[0]] for x in data])
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
......@@ -282,7 +282,7 @@ def decode_main(use_cuda, is_sparse):
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.recursive_sequence_lengths()
print(result_ids.recursive_sequence_lengths())
break
......
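Aside: on Python 3, map() returns a lazy iterator rather than a list, so a result that is indexed, iterated twice, or passed to an API expecting a list must be materialized, which is what the comprehension above does. Sketch:

    data = [(1, 'a'), (2, 'b')]
    feed = [[x[0]] for x in data]   # eager on both interpreters
    assert feed == list(map(lambda x: [x[0]], data)) == [[1], [2]]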
......@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid.core as core
import math
......
......@@ -260,15 +260,15 @@ def infer(use_cuda, save_dirname=None):
# Use the first data from paddle.dataset.movielens.test() as input
assert feed_target_names[0] == "user_id"
# Use create_lod_tensor(data, recursive_sequence_lengths, place) API
# to generate LoD Tensor where `data` is a list of sequences of index
# numbers, `recursive_sequence_lengths` is the length-based level of detail
# (lod) info associated with `data`.
# For example, data = [[10, 2, 3], [2, 3]] means that it contains
# two sequences of indexes, of length 3 and 2, respectively.
# Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
# level of detail info, indicating that `data` consists of two sequences
# of length 3 and 2, respectively.
user_id = fluid.create_lod_tensor([[1]], [[1]], place)
assert feed_target_names[1] == "gender_id"
......
......@@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None):
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0
for pass_id in xrange(2):
for pass_id in range(2):
for data in train_data():
outs = exe.run(framework.default_main_program(),
feed=feeder.feed(data),
......@@ -213,14 +213,14 @@ def infer(use_cuda, save_dirname=None):
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
# Setup input by creating LoDTensor to represent sequence of words.
# Here each word is the basic element of the LoDTensor and the shape of
# each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the recursive_sequence_lengths info is set to [[4, 6]],
# which has only one level of detail. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for two sentences of
# length 4 and 6, respectively.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[4, 6]]
base_shape = [1]
......
......@@ -85,9 +85,11 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
pd = fluid.layers.ParallelDo(places)
with pd.do():
avg_cost, predict_word = __network__(
map(pd.read_input, [
first_word, second_word, third_word, forth_word, next_word
]))
list(
map(pd.read_input, [
first_word, second_word, third_word, forth_word,
next_word
])))
pd.write_output(avg_cost)
avg_cost = fluid.layers.mean(pd())
......@@ -167,11 +169,11 @@ def infer(use_cuda, save_dirname=None):
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
# Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
# is simply an index to look up for the corresponding word vector and hence
# the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
# which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
# meaning there is only one level of detail and there is only one sequence of
# one word on this level.
# Note that recursive_sequence_lengths should be a list of lists.
recursive_seq_lens = [[1]]
......
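Aside: for the word-embedding inference described above, each of the four inputs is a single sequence holding a single word, so the same helper applies with lod [[1]] (a sketch; the ids are placeholders):

    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    first, second, third, fourth = [
        fluid.create_lod_tensor([[i]], [[1]], place) for i in (11, 6, 5, 88)
    ]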
......@@ -78,7 +78,7 @@ for pass_id in range(PASS_NUM):
if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good.
print avg_loss_value[0]
print(avg_loss_value[0])
if math.isnan(float(avg_loss_value)):
sys.exit("got NaN loss, training failed.")
exit(1)
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import paddle
......
......@@ -118,7 +118,7 @@ def main():
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0
for pass_id in xrange(10):
for pass_id in range(10):
for data in train_data():
outs = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
......
......@@ -137,7 +137,7 @@ def main():
generated_img = exe.run(g_program,
feed={'noise': n},
fetch_list={g_img})[0]
real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
real_data = numpy.array([x[0] for x in data]).astype('float32')
real_data = real_data.reshape(num_true, 784)
total_data = numpy.concatenate([real_data, generated_img])
total_label = numpy.concatenate([
......@@ -150,7 +150,7 @@ def main():
feed={'img': total_data,
'label': total_label},
fetch_list={d_loss})[0]
for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
for _ in range(NUM_TRAIN_TIMES_OF_DG):
n = numpy.random.uniform(
low=-1.0, high=1.0,
size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
......
......@@ -36,7 +36,7 @@ if len(sys.argv) == 1:
else:
word_dict = load_vocab(sys.argv[1])
word_dict["<unk>"] = len(word_dict)
print "Dict dim = ", len(word_dict)
print("Dict dim = ", len(word_dict))
# input text data
data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
......
......@@ -194,7 +194,7 @@ class TestRoutineOp(unittest.TestCase):
quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
for i in xrange(10):
for i in range(10):
fluid.channel_recv(ch1, result)
Print(result)
......
......@@ -155,7 +155,7 @@ def train_main(use_cuda):
]
feeder = fluid.DataFeeder(feed_list, place)
for pass_id in xrange(1):
for pass_id in range(1):
for batch_id, data in enumerate(train_reader()):
outs = exe.run(main_program,
feed=feeder.feed(data),
......@@ -204,8 +204,8 @@ def decode_main(use_cuda):
]
feeder = fluid.DataFeeder(feed_list, place)
data = train_reader().next()
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
data = next(train_reader())
feed_dict = feeder.feed([[x[0]] for x in data])
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
......@@ -214,7 +214,7 @@ def decode_main(use_cuda):
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.lod()
print(result_ids.lod())
class TestBeamSearchDecoder(unittest.TestCase):
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.framework import Program, program_guard
......
......@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
......
......@@ -76,15 +76,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
x_data = np.array([x[0] for x in data]).astype("float32")
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = np.expand_dims(y_data, axis=1)
outs = exe.run(prog,
feed={'x': x_data,
'y': y_data},
fetch_list=[avg_loss])
print outs[0]
print(outs[0])
if outs[0] < 1.0:
return
self.assertFalse(True)
......@@ -131,15 +131,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
x_data = np.array([x[0] for x in data]).astype("float32")
y_data = np.array([x[1] for x in data]).astype("int64")
y_data = y_data.reshape((y_data.shape[0], 1))
outs = exe.run(prog,
feed={'x': x_data,
'y': y_data},
fetch_list=[avg_loss])
print outs[0]
print(outs[0])
if outs[0] < 1.0:
return
self.assertFalse(True)
......
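Aside: the feed construction in these hunks, a batch of (image, label) tuples split into two arrays, in plain numpy (shapes are illustrative):

    import numpy as np

    data = [(np.zeros(784, 'float32'), 3), (np.ones(784, 'float32'), 7)]
    x_data = np.array([x[0] for x in data]).astype("float32")
    y_data = np.array([x[1] for x in data]).astype("int64").reshape((-1, 1))
    assert x_data.shape == (2, 784) and y_data.shape == (2, 1)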
......@@ -16,6 +16,7 @@ import numpy as np
import unittest
import time
import itertools
import six
import paddle.fluid as fluid
import paddle.fluid.core as core
......@@ -40,8 +41,8 @@ class BenchmarkSuite(OpTest):
expect_t = np.array(item_cpu_out)
actual = item_gpu_out
actual_t = np.array(item_gpu_out)
var_name = variable if isinstance(variable,
basestring) else variable.name
var_name = variable if isinstance(
variable, six.string_types) else variable.name
self.assertTrue(
np.allclose(
actual_t, expect_t, atol=atol),
......@@ -53,7 +54,7 @@ class BenchmarkSuite(OpTest):
def _get_input_names(self):
inputs = []
for name, value in self.inputs.iteritems():
for name, value in list(self.inputs.items()):
if isinstance(value, list):
inputs.extend([sub_name for sub_name, _ in value])
inputs.append(name)
......@@ -61,7 +62,7 @@ class BenchmarkSuite(OpTest):
def _get_output_names(self):
outputs = []
for var_name, var in self.outputs.iteritems():
for var_name, var in list(self.outputs.items()):
if isinstance(var, list):
for sub_var_name, sub_var in var:
outputs.append(sub_var_name)
......
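Aside: dict.iteritems()/itervalues() and basestring do not exist on Python 3; .items()/.values() plus six.string_types cover both interpreters, with list() as a guard when the loop may mutate the dict. A minimal sketch, assuming six is available:

    import six

    d = {'x': 1, 'y': 2}
    for name, value in list(d.items()):   # safe even if d changes inside the loop
        pass

    def var_name(v):
        # v is either a plain name or a variable-like object (hypothetical)
        return v if isinstance(v, six.string_types) else v.name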
......@@ -14,6 +14,7 @@
import numpy as np
import argparse
import six
import time
import math
......@@ -299,7 +300,7 @@ class DistSeResneXt2x2:
True, loss_name=avg_cost.name, exec_strategy=strategy)
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
var for var in trainer_prog.global_block().vars.values()
if var.is_data
]
......@@ -311,7 +312,7 @@ class DistSeResneXt2x2:
feed=feeder.feed(data))
print(first_loss)
for i in xrange(5):
for i in six.moves.xrange(5):
data = next(reader_generator)
loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))
......
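Aside: xrange is gone on Python 3, where range is already lazy; on Python 2, plain range builds a list, which is harmless for small loop bounds. six.moves.xrange, as used above, stays lazy on both. Sketch:

    import six

    for i in range(5):              # list on Py2, lazy range object on Py3
        pass
    for i in six.moves.xrange(5):   # lazy on both interpreters
        pass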
......@@ -26,13 +26,15 @@ from paddle.fluid.op import Operator
from paddle.fluid.executor import Executor
from paddle.fluid.framework import Program, OpProtoHolder, Variable
from testsuite import create_op, set_input, append_input_output, append_loss_ops
from functools import reduce
from six.moves import zip
def randomize_probability(batch_size, class_num, dtype='float32'):
prob = np.random.uniform(
0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
prob_sum = prob.sum(axis=1)
for i in xrange(len(prob)):
for i in range(len(prob)):
prob[i] /= prob_sum[i]
return prob
......@@ -101,7 +103,7 @@ def get_numeric_gradient(place,
# we only compute gradient of one element each time.
# we use a for loop to compute the gradient of every element.
for i in xrange(tensor_size):
for i in range(tensor_size):
if in_place:
set_input(scope, op, inputs, place)
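Aside: the loop above is the classic one-element-at-a-time numeric gradient check. A self-contained sketch of the idea, using central differences and assuming f maps a contiguous array to a scalar:

    import numpy as np

    def numeric_gradient(f, x, delta=1e-5):
        grad = np.zeros_like(x)
        flat_x, flat_g = x.ravel(), grad.ravel()   # views, since x is contiguous
        for i in range(x.size):
            orig = flat_x[i]
            flat_x[i] = orig + delta
            pos = f(x)
            flat_x[i] = orig - delta
            neg = f(x)
            flat_x[i] = orig
            flat_g[i] = (pos - neg) / (2 * delta)
        return grad

    g = numeric_gradient(lambda a: (a ** 2).sum(), np.ones((2, 2)))  # ~= 2 everywhere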
......@@ -159,7 +161,7 @@ class OpTest(unittest.TestCase):
assert isinstance(
numpy_dict,
dict), "self.inputs, self.outputs must be numpy_dict"
for var_name, var_value in numpy_dict.iteritems():
for var_name, var_value in numpy_dict.items():
if isinstance(var_value, (np.ndarray, np.generic)):
self.try_call_once(var_value.dtype)
elif isinstance(var_value, (list, tuple)):
......@@ -223,7 +225,7 @@ class OpTest(unittest.TestCase):
def _get_io_vars(self, block, numpy_inputs):
inputs = {}
for name, value in numpy_inputs.iteritems():
for name, value in numpy_inputs.items():
if isinstance(value, list):
var_list = [
block.var(sub_name) for sub_name, sub_value in value
......@@ -266,7 +268,7 @@ class OpTest(unittest.TestCase):
# if the fetch_list is customized by user, we use it directly.
# if not, fill the fetch_list by the user configured outputs in test.
if len(fetch_list) == 0:
for var_name, var in outputs.iteritems():
for var_name, var in outputs.items():
if isinstance(var, list):
for v in var:
fetch_list.append(v)
......@@ -278,7 +280,7 @@ class OpTest(unittest.TestCase):
fetch_list.append(str(out_name))
# fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], fluid.framework.Variable):
fetch_list = map(block.var, fetch_list)
fetch_list = list(map(block.var, fetch_list))
outs = executor.run(program,
feed=feed_map,
fetch_list=fetch_list,
......@@ -369,7 +371,7 @@ class OpTest(unittest.TestCase):
def __assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
for a, b, name in zip(numeric_grads, analytic_grads, names):
abs_a = np.abs(a)
abs_a[abs_a < 1e-3] = 1
......@@ -510,6 +512,6 @@ class OpTest(unittest.TestCase):
use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
else:
executor = Executor(place)
return map(np.array,
executor.run(prog, feed_dict, fetch_list,
return_numpy=False))
return list(
map(np.array,
executor.run(prog, feed_dict, fetch_list, return_numpy=False)))
......@@ -91,7 +91,7 @@ class TestParallelExecutorBase(unittest.TestCase):
first_loss, = run_executor(
exe=exe, feed=feed_dict, fetch_list=[loss.name])
for i in xrange(iter):
for i in range(iter):
run_executor(exe=exe, feed=feed_dict, fetch_list=[])
last_loss, = run_executor(
......@@ -99,8 +99,8 @@ class TestParallelExecutorBase(unittest.TestCase):
end = time.time()
if batch_size is not None:
print "%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin))
print("%.4f Instance per second" % (
(batch_size * iter + 2) / (end - begin)))
avg_last_loss_val = np.array(last_loss).mean()
avg_first_loss_val = np.array(first_loss).mean()
......@@ -108,6 +108,6 @@ class TestParallelExecutorBase(unittest.TestCase):
float(avg_first_loss_val)):
sys.exit("got NaN loss, training failed.")
print first_loss, last_loss
print(first_loss, last_loss)
# self.assertGreater(first_loss[0], last_loss[0])
return first_loss, last_loss
......@@ -26,7 +26,7 @@ class TestAccuracyOp(OpTest):
label = np.random.randint(0, 2, (n, 1))
self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
num_correct = 0
for rowid in xrange(n):
for rowid in range(n):
for ele in indices[rowid]:
if ele == label[rowid]:
num_correct += 1
......
......@@ -273,7 +273,7 @@ class TestSparseAdamOp(unittest.TestCase):
self.setup(scope, place)
op_args = dict()
for key, np_array in self.dense_inputs.iteritems():
for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor()
var.set(np_array, place)
op_args[key] = key
......@@ -290,7 +290,7 @@ class TestSparseAdamOp(unittest.TestCase):
adam_op = Operator("adam", **op_args)
adam_op.run(scope, place)
for key, np_array in self.outputs.iteritems():
for key, np_array in self.outputs.items():
out_var = scope.var(key).get_tensor()
actual = np.array(out_var)
actual = actual.reshape([actual.size])
......
......@@ -80,8 +80,9 @@ class TestArrayReadWrite(unittest.TestCase):
append_backward(total_sum_scaled)
g_vars = map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x])
g_vars = list(
map(default_main_program().global_block().var,
[each_x.name + "@GRAD" for each_x in x]))
g_out = [
item.sum()
for item in exe.run(
......
......@@ -415,7 +415,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
self.__assert_close(scale_grad, out[6], "scale_grad")
self.__assert_close(bias_grad, out[7], "bias_grad")
print "op test forward passed: ", str(place), data_layout
print("op test forward passed: ", str(place), data_layout)
places = [core.CPUPlace()]
......
......@@ -59,8 +59,7 @@ class BeamSearchOpTester(unittest.TestCase):
np.allclose(
np.array(selected_scores),
np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
self.assertEqual(selected_ids.lod(),
[[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]])
self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
def _create_pre_ids(self):
np_data = np.array([[1, 2, 3, 4]], dtype='int64')
......
......@@ -48,7 +48,7 @@ def bipartite_match(distance, match_indices, match_dist):
def argmax_match(distance, match_indices, match_dist, threshold):
r, c = distance.shape
for j in xrange(c):
for j in range(c):
if match_indices[j] != -1:
continue
col_dist = distance[:, j]
......
......@@ -63,7 +63,7 @@ class TestChunkEvalOp(OpTest):
# generate chunk beginnings
chunk_begins = sorted(
np.random.choice(
range(starts[-1]), num_chunks, replace=False))
list(range(starts[-1])), num_chunks, replace=False))
seq_chunk_begins = []
begin_idx = 0
# divide chunks into sequences
......@@ -93,7 +93,7 @@ class TestChunkEvalOp(OpTest):
self.num_infer_chunks + self.num_label_chunks
- self.num_correct_chunks)
correct_chunks = np.random.choice(
range(len(chunks)), self.num_correct_chunks, replace=False)
list(range(len(chunks))), self.num_correct_chunks, replace=False)
infer_chunks = np.random.choice(
[x for x in range(len(chunks)) if x not in correct_chunks],
self.num_infer_chunks - self.num_correct_chunks,
......@@ -138,7 +138,8 @@ class TestChunkEvalOp(OpTest):
infer.fill(self.num_chunk_types * self.num_tag_types)
label = np.copy(infer)
starts = np.random.choice(
range(1, self.batch_size), self.num_sequences - 1,
list(range(1, self.batch_size)),
self.num_sequences - 1,
replace=False).tolist()
starts.extend([0, self.batch_size])
starts = sorted(starts)
......
......@@ -39,7 +39,7 @@ class ConditionalBlockTest(unittest.TestCase):
x = numpy.random.random(size=(10, 1)).astype('float32')
outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
print outs
print(outs)
loss = layers.mean(out)
append_backward(loss=loss)
outs = exe.run(
......@@ -47,7 +47,7 @@ class ConditionalBlockTest(unittest.TestCase):
fetch_list=[
default_main_program().block(0).var(data.name + "@GRAD")
])[0]
print outs
print(outs)
if __name__ == '__main__':
......
......@@ -22,8 +22,8 @@ def conv_shift_forward(x, y):
M = x.shape[1]
N = y.shape[1]
y_half_width = (N - 1) / 2
for i in xrange(M):
for j in xrange(N):
for i in range(M):
for j in range(N):
out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
return out
......
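Aside: one pitfall 2to3 does not touch is division: (N - 1) / 2 above becomes a float on Python 3, so any such value used as an index or a shape needs // instead. Sketch:

    N = 5
    print((N - 1) / 2)    # 2 on Python 2, 2.0 on Python 3
    print((N - 1) // 2)   # 2 on both; prefer this when the result indexes an array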
......@@ -18,7 +18,7 @@ import paddle.fluid.layers as layers
class TestDocString(unittest.TestCase):
def test_layer_doc_string(self):
print layers.dropout.__doc__
print(layers.dropout.__doc__)
if __name__ == '__main__':
......
......@@ -21,7 +21,7 @@ import numpy as np
class TestDataBalance(unittest.TestCase):
def prepare_data(self):
def fake_data_generator():
for n in xrange(self.total_ins_num):
for n in range(self.total_ins_num):
yield np.ones((3, 4)) * n, n
# Prepare data
......@@ -41,7 +41,7 @@ class TestDataBalance(unittest.TestCase):
def prepare_lod_data(self):
def fake_data_generator():
for n in xrange(1, self.total_ins_num + 1):
for n in range(1, self.total_ins_num + 1):
d1 = (np.ones((n, 3)) * n).astype('float32')
d2 = (np.array(n).reshape((1, 1))).astype('int32')
yield d1, d2
......@@ -58,9 +58,9 @@ class TestDataBalance(unittest.TestCase):
(0, 1))
]
lod = [0]
for _ in xrange(self.batch_size):
for _ in range(self.batch_size):
try:
ins = generator.next()
ins = next(generator)
except StopIteration:
eof = True
break
......
......@@ -39,7 +39,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
self.assertTrue(i.is_int())
self.assertEqual(10, i.get_int())
for _ in xrange(10):
for _ in range(10):
scoped_function(__new_scope__)
......
......@@ -176,7 +176,7 @@ class TestDetectionMAPOp(OpTest):
true_pos[label].append([score, tp])
false_pos[label].append([score, fp])
for (label, label_pos_num) in label_count.items():
for (label, label_pos_num) in list(label_count.items()):
if label_pos_num == 0 or label not in true_pos: continue
label_true_pos = true_pos[label]
label_false_pos = false_pos[label]
......
......@@ -25,6 +25,7 @@ import unittest
from multiprocessing import Process
import os
import signal
from functools import reduce
SEED = 1
DTYPE = "float32"
......@@ -172,12 +173,12 @@ class TestDistMnist(unittest.TestCase):
exe.run(fluid.default_startup_program())
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
var for var in trainer_prog.global_block().vars.values()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in xrange(10):
for pass_id in range(10):
for batch_id, data in enumerate(train_reader()):
exe.run(trainer_prog, feed=feeder.feed(data))
......
......@@ -161,7 +161,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest):
["fill_constant", "fill_constant"])
# the variable #fc_w will be split into two blocks
fc_w_var = startup2.global_block().var("fc_w")
self.assertEqual(fc_w_var.shape, (1000L, 1000L))
self.assertEqual(fc_w_var.shape, (1000, 1000))
# all parameters should be optimized on pserver
pserver_params = []
......@@ -194,9 +194,9 @@ class TestNoSliceVar(TranspilerTest):
_, startup = self.get_pserver(self.pserver1_ep, config)
_, startup2 = self.get_pserver(self.pserver2_ep, config)
if startup.global_block().vars.has_key("fc_w"):
if "fc_w" in startup.global_block().vars:
fc_w_var = startup.global_block().vars["fc_w"]
elif startup2.global_block().vars.has_key("fc_w"):
elif "fc_w" in startup2.global_block().vars:
fc_w_var = startup2.global_block().vars["fc_w"]
self.assertEqual(fc_w_var.shape, (1000, 1000))
......
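Aside: dict.has_key() was removed in Python 3; the in operator is the spelling that works everywhere, including for mapping-like objects such as a block's vars. Sketch:

    params = {'fc_w': (1000, 1000)}
    if 'fc_w' in params:   # works on Python 2 and Python 3
        shape = params['fc_w']
    # params.has_key('fc_w') raises AttributeError on Python 3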
......@@ -183,12 +183,12 @@ class TestDistMnist(unittest.TestCase):
exec_strategy=exec_strategy)
feed_var_list = [
var for var in trainer_prog.global_block().vars.itervalues()
var for var in trainer_prog.global_block().vars.values()
if var.is_data
]
feeder = fluid.DataFeeder(feed_var_list, place)
for pass_id in xrange(10):
for pass_id in range(10):
for batch_id, data in enumerate(train_reader()):
avg_loss_np = train_exe.run(feed=feeder.feed(data),
fetch_list=[avg_cost.name])
......
......@@ -135,7 +135,7 @@ class TestDynRNN(unittest.TestCase):
loss_0 = exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[loss])[0]
for _ in xrange(100):
for _ in range(100):
val = exe.run(main_program,
feed=feeder.feed(data),
fetch_list=[loss])[0]
......
......@@ -61,13 +61,13 @@ class BaseRNN(object):
self.num_seq = num_seq
self.inputs = collections.defaultdict(list)
for _ in xrange(num_seq):
for _ in range(num_seq):
seq_len = random.randint(1, max_seq_len - 1)
for iname in ins:
ishape = ins[iname].get('shape', None)
idtype = ins[iname].get('dtype', 'float32')
lst = []
for _ in xrange(seq_len):
for _ in range(seq_len):
lst.append(numpy.random.random(size=ishape).astype(idtype))
self.inputs[iname].append(lst)
......@@ -96,16 +96,16 @@ class BaseRNN(object):
for out in self.outputs:
retv[out] = []
for seq_id in xrange(self.num_seq):
for seq_id in range(self.num_seq):
for mname in self.mems:
self.mems[mname].reset()
for out in self.outputs:
self.outputs[out].next_sequence()
iname0 = self.inputs.keys()[0]
iname0 = list(self.inputs.keys())[0]
seq_len = len(self.inputs[iname0][seq_id])
for step_id in xrange(seq_len):
for step_id in range(seq_len):
xargs = dict()
for iname in self.inputs:
......@@ -138,7 +138,7 @@ class BaseRNN(object):
for iname in self.inputs:
lod = []
np_flatten = []
for seq_id in xrange(len(self.inputs[iname])):
for seq_id in range(len(self.inputs[iname])):
seq_len = len(self.inputs[iname][seq_id])
lod.append(seq_len)
np_flatten.extend(self.inputs[iname][seq_id])
......@@ -159,8 +159,8 @@ class BaseRNN(object):
" which is not matrix")
g = numpy.zeros(shape=p.shape, dtype=p.dtype)
for i in xrange(p.shape[0]):
for j in xrange(p.shape[1]):
for i in range(p.shape[0]):
for j in range(p.shape[1]):
o = p[i][j]
p[i][j] += delta
pos = self._exe_mean_out_()
......@@ -184,7 +184,7 @@ class BaseRNN(object):
if len(item.shape) != 1:
raise ValueError("Not support")
for i in xrange(len(item)):
for i in range(len(item)):
o = item[i]
item[i] += delta
pos = self._exe_mean_out_()
......@@ -198,14 +198,14 @@ class BaseRNN(object):
if not return_one_tensor:
return grad
for i in xrange(len(grad)):
for i in range(len(grad)):
grad[i] = numpy.concatenate(grad[i])
grad = numpy.concatenate(grad)
return grad
def _exe_mean_out_(self):
outs = self.exe()
return numpy.array([o.mean() for o in outs.itervalues()]).mean()
return numpy.array([o.mean() for o in outs.values()]).mean()
class SeedFixedTestCase(unittest.TestCase):
......@@ -274,13 +274,14 @@ class TestSimpleMul(SeedFixedTestCase):
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
out, w_g, i_g = map(numpy.array,
exe.run(feed=py_rnn.to_feed(cpu),
fetch_list=[
out, self.PARAM_NAME + "@GRAD",
self.DATA_NAME + "@GRAD"
],
return_numpy=False))
out, w_g, i_g = list(
map(numpy.array,
exe.run(feed=py_rnn.to_feed(cpu),
fetch_list=[
out, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
"@GRAD"
],
return_numpy=False)))
out_by_python = py_rnn.exe()[self.OUT_NAME]
self.assertTrue(numpy.allclose(out, out_by_python))
w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
......@@ -351,14 +352,15 @@ class TestSimpleMulWithMemory(SeedFixedTestCase):
cpu = fluid.CPUPlace()
exe = fluid.Executor(cpu)
feed = py_rnn.to_feed(cpu)
last_np, w_g, i_g = map(numpy.array,
exe.run(feed=feed,
fetch_list=[
last, self.PARAM_NAME + "@GRAD",
self.DATA_NAME + "@GRAD"
],
return_numpy=False))
last_by_py, = py_rnn.exe().values()
last_np, w_g, i_g = list(
map(numpy.array,
exe.run(feed=feed,
fetch_list=[
last, self.PARAM_NAME + "@GRAD", self.DATA_NAME +
"@GRAD"
],
return_numpy=False)))
last_by_py, = list(py_rnn.exe().values())
w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
self.assertTrue(numpy.allclose(last_np, last_by_py))
......
......@@ -67,7 +67,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
def _lodtensor_to_ndarray(self, lod_tensor):
dims = lod_tensor.shape()
ndarray = np.zeros(shape=dims).astype('float32')
for i in xrange(np.product(dims)):
for i in range(np.product(dims)):
ndarray.ravel()[i] = lod_tensor._get_float_element(i)
return ndarray, lod_tensor.recursive_sequence_lengths()
......@@ -114,7 +114,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
shape=[1], dtype='int64', value=0)
step_idx.stop_gradient = True
for i in xrange(self._max_sequence_len):
for i in range(self._max_sequence_len):
step_out = fluid.layers.array_read(static_input_out_array,
step_idx)
step_out.stop_gradient = True
......@@ -140,27 +140,27 @@ class TestDyRnnStaticInput(unittest.TestCase):
static_lod = self.static_input_tensor.recursive_sequence_lengths()
static_sliced = []
cur_offset = 0
for i in xrange(len(static_lod[0])):
for i in range(len(static_lod[0])):
static_sliced.append(self.static_input_data[cur_offset:(
cur_offset + static_lod[0][i])])
cur_offset += static_lod[0][i]
static_seq_len = static_lod[0]
static_reordered = []
for i in xrange(len(x_sorted_indices)):
for i in range(len(x_sorted_indices)):
static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
static_seq_len_reordered = [
static_seq_len[x_sorted_indices[i]]
for i in xrange(len(x_sorted_indices))
for i in range(len(x_sorted_indices))
]
static_step_outs = []
static_step_lods = []
for i in xrange(self._max_sequence_len):
for i in range(self._max_sequence_len):
end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
lod = []
total_len = 0
for i in xrange(end):
for i in range(end):
lod.append(static_seq_len_reordered[i])
total_len += lod[-1]
static_step_lods.append([lod])
......@@ -174,7 +174,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
static_step_outs = self.build_graph(only_forward=True)
self.exe.run(framework.default_startup_program())
expected_outs, expected_lods = self.get_expected_static_step_outs()
for i in xrange(self._max_sequence_len):
for i in range(self._max_sequence_len):
step_out, lod = self.fetch_value(static_step_outs[i])
self.assertTrue(np.allclose(step_out, expected_outs[i]))
self.assertTrue(np.allclose(lod, expected_lods[i]))
......@@ -189,7 +189,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
# calculate numeric gradients
tensor_size = np.product(static_input_shape)
for i in xrange(tensor_size):
for i in range(tensor_size):
origin = self.static_input_tensor._get_float_element(i)
x_pos = origin + self._delta
self.static_input_tensor._set_float_element(i, x_pos)
......
......@@ -26,7 +26,7 @@ class TestElementWiseAddOp(unittest.TestCase):
def test_with_place(place):
out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
x_grad = out_grad
sum_axis = range(0, len(self.x.shape))
sum_axis = list(range(0, len(self.x.shape)))
del sum_axis[self.axis]
y_grad = np.sum(out_grad, axis=tuple(sum_axis))
......
......@@ -38,7 +38,7 @@ class TestGRUOp(OpTest):
for i in range(len(seq_lens)):
seq_starts.append(seq_starts[-1] + seq_lens[i])
sorted_seqs = sorted(
range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
list(range(len(seq_lens))), lambda x, y: seq_lens[y] - seq_lens[x])
num_batch = seq_lens[sorted_seqs[0]]
for batch_idx in range(num_batch):
idx_in_seq = []
......@@ -74,15 +74,16 @@ class TestGRUOp(OpTest):
def gru(self):
input, lod = self.inputs['Input']
w = self.inputs['Weight']
b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
(1, self.frame_size * 3))
batch_gate = self.outputs['BatchGate']
batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
batch_hidden = self.outputs['BatchHidden']
hidden = self.outputs['Hidden']
idx_in_seq_list = self.idx_in_seq_list
h_p = self.inputs['H0'][self.sorted_seqs] if self.inputs.has_key(
'H0') else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
h_p = self.inputs['H0'][
self.sorted_seqs] if 'H0' in self.inputs else np.zeros(
(len(idx_in_seq_list[0]), self.frame_size))
num_batch = len(idx_in_seq_list)
end_idx = 0
for batch_idx in range(num_batch):
......
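Aside: a porting hazard worth flagging in the GRU hunk above: Python 3's sorted() dropped the cmp argument entirely, so a comparator such as lambda x, y: seq_lens[y] - seq_lens[x] has to become a key function (or go through functools.cmp_to_key). A sketch of the equivalent longest-first ordering, assuming seq_lens holds sequence lengths:

    from functools import cmp_to_key

    seq_lens = [3, 5, 2]
    by_key = sorted(range(len(seq_lens)), key=lambda i: seq_lens[i], reverse=True)
    by_cmp = sorted(range(len(seq_lens)),
                    key=cmp_to_key(lambda x, y: seq_lens[y] - seq_lens[x]))
    assert by_key == by_cmp == [1, 0, 2]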
......@@ -76,7 +76,7 @@ class TestGRUUnitOp(OpTest):
x = self.inputs['Input']
h_p = self.inputs['HiddenPrev']
w = self.inputs['Weight']
b = self.inputs['Bias'] if self.inputs.has_key('Bias') else np.zeros(
b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
(1, frame_size * 3))
g = x + np.tile(b, (batch_size, 1))
w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
......
......@@ -43,7 +43,7 @@ class TestLayer(unittest.TestCase):
hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
fluid.layers.batch_norm(input=hidden2)
print str(main_program)
print(str(main_program))
def test_dropout_layer(self):
main_program = Program()
......@@ -53,7 +53,7 @@ class TestLayer(unittest.TestCase):
name='pixel', shape=[3, 48, 48], dtype='float32')
fluid.layers.dropout(x=images, dropout_prob=0.5)
print str(main_program)
print(str(main_program))
def test_img_conv_group(self):
main_program = Program()
......@@ -65,7 +65,7 @@ class TestLayer(unittest.TestCase):
conv1 = conv_block(images, 64, 2, [0.3, 0])
conv_block(conv1, 256, 3, [0.4, 0.4, 0])
print str(main_program)
print(str(main_program))
def test_elementwise_add_with_act(self):
main_program = Program()
......
......@@ -48,7 +48,7 @@ class TestBook(unittest.TestCase):
exe.run(init_program, feed={}, fetch_list=[])
for i in xrange(100):
for i in range(100):
tensor_x = np.array(
[[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
......
......@@ -17,6 +17,7 @@ import numpy as np
from operator import mul
import paddle.fluid.core as core
import paddle.fluid as fluid
from functools import reduce
np.random.random(123)
......
......@@ -279,7 +279,7 @@ class TestBook(unittest.TestCase):
def test_nce(self):
window_size = 5
words = []
for i in xrange(window_size):
for i in range(window_size):
words.append(
layers.data(
name='word_{0}'.format(i), shape=[1], dtype='int64'))
......@@ -288,7 +288,7 @@ class TestBook(unittest.TestCase):
label_word = int(window_size / 2) + 1
embs = []
for i in xrange(window_size):
for i in range(window_size):
if i == label_word:
continue
......
......@@ -36,7 +36,7 @@ class TestLoDRankTable(unittest.TestCase):
exe.run(scope=scope, feed={'x': tensor})
var = scope.find_var(rank_table.name)
table = var.get_lod_rank_table()
self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
self.assertEqual([(0, 5), (1, 1), (2, 1)], list(table.items()))
if __name__ == '__main__':
......
......@@ -24,7 +24,7 @@ class TestLoDTensorArray(unittest.TestCase):
tensor_array = arr.get_lod_tensor_array()
self.assertEqual(0, len(tensor_array))
cpu = core.CPUPlace()
for i in xrange(10):
for i in range(10):
t = core.LoDTensor()
t.set(numpy.array([i], dtype='float32'), cpu)
t.set_recursive_sequence_lengths([[1]])
......@@ -32,7 +32,7 @@ class TestLoDTensorArray(unittest.TestCase):
self.assertEqual(10, len(tensor_array))
for i in xrange(10):
for i in range(10):
t = tensor_array[i]
self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
self.assertEqual([[1]], t.recursive_sequence_lengths())
......
......@@ -35,8 +35,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
tensor.set(
numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
tensor.set_recursive_sequence_lengths([[3, 6, 1]])
expect = map(lambda x: numpy.array(x).astype('int32'),
[[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
expect = [
numpy.array(x).astype('int32')
for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
]
self.main(
tensor=tensor,
expect_array=expect,
......@@ -48,8 +50,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
tensor.set(
numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
expect = map(lambda x: numpy.array(x).astype('int32'),
[[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
expect = [
numpy.array(x).astype('int32')
for x in [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]
]
self.main(
tensor=tensor,
expect_array=expect,
......@@ -111,8 +115,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
expect = [
numpy.array(
item, dtype='int32')
for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
22, 39) + range(7, 21), range(39, 46)]
for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], list(
range(22, 39)) + list(range(7, 21)), list(range(39, 46))]
]
lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
[[2], [6, 1]]]
......
......@@ -56,7 +56,7 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
ids = np.squeeze(self.inputs['Ids'])
padding_idx = np.random.choice(ids, 1)[0]
self.outputs['Out'][ids == padding_idx] = np.zeros(31)
self.attrs = {'padding_idx': long(padding_idx)}
self.attrs = {'padding_idx': int(padding_idx)}
self.check_output()
def test_check_grad(self):
......
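Aside: Python 3 folds long into int, so long(padding_idx) has no Python 3 spelling and literals drop the L suffix (as in the beam-search lod expectation earlier). int() is portable, and Python 2 promotes to long automatically when needed. Sketch:

    import numpy as np

    padding_idx = np.int64(7)                  # numpy scalar, as np.random.choice returns
    attrs = {'padding_idx': int(padding_idx)}  # valid on both interpreters
    # 1000L is a SyntaxError on Python 3; write 1000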
......@@ -80,7 +80,7 @@ class TestMeanIOUOp(OpTest):
'InCorrects': in_corrects,
'InMeanIou': in_mean_ious
}
self.attrs = {'num_classes': long(self.num_classes)}
self.attrs = {'num_classes': int(self.num_classes)}
mean_iou, out_wrong, out_correct = compute_mean_iou(
predictions, labels, self.num_classes, in_wrongs, in_corrects,
in_mean_ious)
......
......@@ -112,7 +112,7 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
if keep_top_k > -1 and num_det > keep_top_k:
score_index = []
for c, indices in selected_indices.iteritems():
for c, indices in selected_indices.items():
for idx in indices:
score_index.append((scores[c][idx], c, idx))
......@@ -143,7 +143,7 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
lod.append(nmsed_num)
if nmsed_num == 0: continue
for c, indices in nmsed_outs.iteritems():
for c, indices in nmsed_outs.items():
for idx in indices:
xmin, ymin, xmax, ymax = boxes[n][idx][:]
det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
......
......@@ -66,7 +66,7 @@ class TestNCE(OpTest):
self.attrs = {
'num_total_classes': num_classes,
'num_neg_samples': num_neg_samples,
'custom_neg_classes': range(num_neg_samples)
'custom_neg_classes': list(range(num_neg_samples))
}
self.inputs = {
'Input': input,
......
......@@ -28,13 +28,13 @@ class TestOneHotOp(OpTest):
depth = 10
dimension = 12
x_lod = [[4, 1, 3, 3]]
x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
out = np.zeros(shape=(np.product(x.shape[:-1]),
depth)).astype('float32')
for i in xrange(np.product(x.shape)):
for i in range(np.product(x.shape)):
out[i, x[i]] = 1.0
self.inputs = {'X': (x, x_lod)}
......@@ -51,13 +51,13 @@ class TestOneHotOp_default_dtype(OpTest):
depth = 10
dimension = 12
x_lod = [[4, 1, 3, 3]]
x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
out = np.zeros(shape=(np.product(x.shape[:-1]),
depth)).astype('float32')
for i in xrange(np.product(x.shape)):
for i in range(np.product(x.shape)):
out[i, x[i]] = 1.0
self.inputs = {'X': (x, x_lod)}
......@@ -76,7 +76,7 @@ class TestOneHotOp_exception(OpTest):
self.dimension = 12
self.x = core.LoDTensor()
x_lod = [[4, 1, 3, 3]]
data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))]
data = [np.random.randint(11, 20) for i in range(sum(x_lod[0]))]
data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
self.x.set(data, self.place)
self.x.set_recursive_sequence_lengths(x_lod)
......
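Aside: the reference one-hot construction above, in vectorized form (the values are placeholders):

    import numpy as np

    x = np.array([[2], [0], [1]])
    depth = 4
    out = np.zeros((x.shape[0], depth), dtype='float32')
    out[np.arange(x.shape[0]), x.ravel()] = 1.0   # one 1.0 per row, at column x[i]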
......@@ -167,10 +167,10 @@ class TestCRFModel(unittest.TestCase):
place=fluid.CPUPlace())
data = train_data()
for i in xrange(10):
for i in range(10):
cur_batch = next(data)
print pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name])[0]
print(pe.run(feed=feeder.feed(cur_batch),
fetch_list=[avg_cost.name])[0])
@unittest.skip(reason="CI hangs")
def test_update_sparse_parameter_all_reduce(self):
......
......@@ -71,7 +71,7 @@ class TestFetchOp(unittest.TestCase):
fetch_list = []
all_vars = main.global_block().vars
for k, v in all_vars.iteritems():
for k, v in all_vars.items():
if 'tmp' not in k and k[0] is not '_' or v.persistable:
fetch_list.append(k)
......@@ -90,7 +90,7 @@ class TestFetchOp(unittest.TestCase):
iters = 3
train_inputs = []
for i in range(iters):
train_inputs.append(tst_reader_iter.next())
train_inputs.append(next(tst_reader_iter))
os.environ['CPU_NUM'] = str(4)
if core.is_compiled_with_cuda():
......@@ -133,7 +133,7 @@ class TestFeedParallel(unittest.TestCase):
for batch_id, data in enumerate(reader()):
loss_np = pe.run(feed=data, fetch_list=[loss.name])[0]
print batch_id, loss_np
print(batch_id, loss_np)
if batch_id == 2:
break
......
......@@ -37,7 +37,7 @@ def simple_fc_net(use_feed):
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(4):
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
......@@ -64,7 +64,7 @@ def fc_with_batchnorm(use_feed):
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in xrange(1):
for _ in range(1):
hidden = fluid.layers.fc(
hidden,
size=200,
......@@ -128,9 +128,9 @@ class TestMNIST(TestParallelExecutorBase):
use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
self.assertAlmostEqual(loss[0], loss[1], delta=1e-4)
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
......
......@@ -25,7 +25,7 @@ def simple_fc_net():
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in xrange(4):
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
......@@ -71,7 +71,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
share_vars_from=train_exe,
build_strategy=build_strategy)
for i in xrange(5):
for i in range(5):
test_loss, = test_exe.run([loss.name], feed=feed_dict)
train_loss, = train_exe.run([loss.name], feed=feed_dict)
......
......@@ -18,6 +18,7 @@ import paddle.fluid as fluid
from paddle.fluid.layers.device import get_places
import paddle.fluid.profiler as profiler
import numpy
import six
class BaseParallelForTest(unittest.TestCase):
......@@ -25,20 +26,20 @@ class BaseParallelForTest(unittest.TestCase):
"""
Run the unittest for parallel.for
Args:
callback(callable): A callable function returns a generator. There
are two yields in the generator function. The first yield
returns the data layers, and the second yield returns the loss.
The modified data variables will be sent back during the first
yield.
feed(dict): The executor feeding dictionary.
fetch(list|basestr): The fetch name lists.
Returns:
None
Raises:
AssertionError when the computation of cpu, parallel.for in cpu,
gpu, parallel.for in gpu are different.
"""
......@@ -95,14 +96,14 @@ class BaseParallelForTest(unittest.TestCase):
"""
Run a single test, returns the fetch values
Args:
place(Place): the computation place.
use_parallel(bool): Whether use parallel.for or not.
Returns:
Fetched numpy arrays.
"""
if isinstance(fetch, basestring):
if isinstance(fetch, six.string_types):
fetch = [fetch]
main = fluid.Program()
startup = fluid.Program()
......@@ -124,7 +125,7 @@ class BaseParallelForTest(unittest.TestCase):
data = [data]
with pd.do():
ins = map(pd.read_input, data)
ins = list(map(pd.read_input, data))
if len(ins) == 1:
ins = ins[0]
loss = generator.send(ins) # patch input
......@@ -156,7 +157,7 @@ class BaseParallelForTest(unittest.TestCase):
Returns:
None
Raises:
AssertionError
......
......@@ -23,9 +23,9 @@ def PolygonBoxRestore(input):
geo_channels = shape[1]
h = shape[2]
w = shape[3]
h_indexes = np.array(range(h) * w).reshape(
h_indexes = np.array(list(range(h)) * w).reshape(
[w, h]).transpose()[np.newaxis, :] # [1, h, w]
w_indexes = np.array(range(w) * h).reshape(
w_indexes = np.array(list(range(w)) * h).reshape(
[h, w])[np.newaxis, :] # [1, h, w]
indexes = np.concatenate(
(w_indexes, h_indexes))[np.newaxis, :] # [1, 2, h, w]
......
......@@ -35,8 +35,8 @@ def max_pool2D_forward_naive(x,
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
for i in range(H_out):
for j in range(W_out):
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
c_start = np.max((j * strides[1] - paddings[1], 0))
......@@ -63,8 +63,8 @@ def avg_pool2D_forward_naive(x,
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
for i in range(H_out):
for j in range(W_out):
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
c_start = np.max((j * strides[1] - paddings[1], 0))
......
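Aside: the H_out/W_out expressions above are the standard pooling output-size formula, and they hit the same Python 3 true-division caveat noted earlier. A floor-division sketch:

    def pool_out_size(in_size, ksize, pad, stride, ceil_mode=False):
        # floor((in + 2*pad - k) / stride) + 1, or the ceiling when ceil_mode is set
        num = in_size + 2 * pad - ksize + (stride - 1 if ceil_mode else 0)
        return num // stride + 1

    assert pool_out_size(32, 2, 0, 2) == 16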
......@@ -38,13 +38,13 @@ def max_pool3D_forward_naive(x,
) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
for k in range(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in xrange(H_out):
for i in range(H_out):
h_start = np.max((i * strides[0] - paddings[0], 0))
h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
for j in xrange(W_out):
for j in range(W_out):
w_start = np.max((j * strides[1] - paddings[1], 0))
w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
......@@ -72,13 +72,13 @@ def avg_pool3D_forward_naive(x,
) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
for k in range(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in xrange(H_out):
for i in range(H_out):
h_start = np.max((i * strides[0] - paddings[0], 0))
h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
for j in xrange(W_out):
for j in range(W_out):
w_start = np.max((j * strides[1] - paddings[1], 0))
w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
......
......@@ -29,21 +29,21 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
mask = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
for k in range(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
for i in xrange(H_out):
for i in range(H_out):
h_start = np.max((i * strides[0] - paddings[0], 0))
h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
for j in xrange(W_out):
for j in range(W_out):
w_start = np.max((j * strides[1] - paddings[1], 0))
w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
for n in xrange(N):
for c in xrange(C):
for n in range(N):
for c in range(C):
arr = x_masked[n, c, :, :, :]
index = np.where(arr == np.max(arr))
sub_deep = index[0][0]
......@@ -67,8 +67,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
mask = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
for i in range(H_out):
for j in range(W_out):
r_start = np.max((i * strides[0] - paddings[0], 0))
r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
c_start = np.max((j * strides[1] - paddings[1], 0))
......@@ -77,8 +77,8 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
for n in xrange(N):
for c in xrange(C):
for n in range(N):
for c in range(C):
arr = x_masked[n, c, :, :]
index = np.where(arr == np.max(arr))
sub_row = index[0][0]
......
......@@ -32,7 +32,7 @@ def py_pnpair_op(score, label, query, column=-1, weight=None):
# accumulate statistics
pos, neg, neu = 0, 0, 0
for _, ranks in predictions.items():
for _, ranks in list(predictions.items()):
for e1, e2 in itertools.combinations(ranks, 2):
s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
w = (w1 + w2) * 0.5
......
......@@ -39,19 +39,19 @@ def get_states(idxs, labels, cls_num, weights=None):
ins_num = idxs.shape[0]
# TP FP TN FN
states = np.zeros((cls_num, 4)).astype('float32')
for i in xrange(ins_num):
for i in range(ins_num):
w = weights[i] if weights is not None else 1.0
idx = idxs[i][0]
label = labels[i][0]
if idx == label:
states[idx][0] += w
for j in xrange(cls_num):
for j in range(cls_num):
states[j][2] += w
states[idx][2] -= w
else:
states[label][3] += w
states[idx][1] += w
for j in xrange(cls_num):
for j in range(cls_num):
states[j][2] += w
states[label][2] -= w
states[idx][2] -= w
......@@ -64,7 +64,7 @@ def compute_metrics(states, cls_num):
total_fn_count = 0.0
macro_avg_precision = 0.0
macro_avg_recall = 0.0
for i in xrange(cls_num):
for i in range(cls_num):
total_tp_count += states[i][0]
total_fp_count += states[i][1]
total_fn_count += states[i][3]
......@@ -90,9 +90,9 @@ class TestPrecisionRecallOp_0(OpTest):
ins_num = 64
cls_num = 10
max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
idxs = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
labels = np.random.choice(xrange(cls_num), ins_num).reshape(
labels = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
states = get_states(idxs, labels, cls_num)
metrics = compute_metrics(states, cls_num)
......@@ -117,10 +117,10 @@ class TestPrecisionRecallOp_1(OpTest):
ins_num = 64
cls_num = 10
max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
idxs = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
labels = np.random.choice(xrange(cls_num), ins_num).reshape(
labels = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
states = get_states(idxs, labels, cls_num, weights)
......@@ -151,10 +151,10 @@ class TestPrecisionRecallOp_2(OpTest):
ins_num = 64
cls_num = 10
max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
idxs = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
labels = np.random.choice(xrange(cls_num), ins_num).reshape(
labels = np.random.choice(range(cls_num), ins_num).reshape(
(ins_num, 1)).astype('int32')
states = np.random.randint(0, 30, (cls_num, 4)).astype('float32')
......
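Aside: the four per-class columns above are TP, FP, TN and FN; precision and recall then follow directly. Sketch:

    def precision_recall(tp, fp, fn):
        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall = tp / (tp + fn) if tp + fn > 0 else 0.0
        return precision, recall

    assert precision_recall(8.0, 2.0, 2.0) == (0.8, 0.8)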
......@@ -183,7 +183,7 @@ class TestBlockDesc(unittest.TestCase):
op2 = block.append_op()
op0 = block._prepend_op()
all_ops = []
for idx in xrange(0, block.op_size()):
for idx in range(0, block.op_size()):
all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op0, op1, op2])
......@@ -205,7 +205,7 @@ class TestBlockDesc(unittest.TestCase):
program._sync_with_cpp()
all_ops = []
for idx in xrange(0, block.op_size()):
for idx in range(0, block.op_size()):
all_ops.append(block.op(idx))
self.assertEqual(all_ops, [op0, op2])
......
......@@ -21,7 +21,7 @@ import unittest
class TestReaderReset(unittest.TestCase):
def prepare_data(self):
def fake_data_generator():
for n in xrange(self.total_ins_num):
for n in range(self.total_ins_num):
yield np.ones(self.ins_shape) * n, n
# Prepare data
......
......@@ -203,12 +203,12 @@ class RecurrentOpTest1(unittest.TestCase):
num_grad[idx], ana_grad[idx], rtol=0.1).all())
def check_forward(self):
print 'test recurrent op forward'
print('test recurrent op forward')
pd_output = self.forward()
py_output = self.py_rnn.forward()
print 'pd_output', pd_output
print('pd_output', pd_output)
print
print 'py_output', py_output
print('py_output', py_output)
self.assertEqual(pd_output.shape, py_output.shape)
self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
......@@ -445,7 +445,7 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
self.output_shape)
self.output = layers.mean(self.create_rnn_op(), **self.p_info)
print self.main_program
print(self.main_program)
def create_rnn_op(self):
x = layers.data(
......
......@@ -26,9 +26,9 @@ class TestSeqProject(OpTest):
if self.context_length == 1 \
and self.context_start == 0 \
and self.padding_trainable:
print "If context_start is 0 " \
print("If context_start is 0 " \
"and context_length is 1," \
" padding_trainable should be false."
" padding_trainable should be false.")
return
# one level, batch size
......@@ -212,7 +212,7 @@ class TestSeqProjectCase2(TestSeqProject):
self.context_stride = 1
self.input_size = [self.input_row, 23]
idx = range(self.input_size[0])
idx = list(range(self.input_size[0]))
del idx[0]
offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
[self.input_size[0]]]
......
......@@ -44,7 +44,7 @@ class TestSequenceExpand(OpTest):
out_lod = [[]]
offset = 0
for i in xrange(len(y_lod[ref_level])):
for i in range(len(y_lod[ref_level])):
repeat_num = y_lod[ref_level][i]
x_len = x_idx[i]
......@@ -55,7 +55,7 @@ class TestSequenceExpand(OpTest):
stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
out = np.vstack((out, stacked_x_sub))
if x_lod is not None:
for j in xrange(repeat_num):
for j in range(repeat_num):
out_lod[0].append(x_len)
offset += x_len
......
......@@ -35,7 +35,7 @@ class TestSequenceReshape(OpTest):
def compute_output(self, x, x_lod, dimension):
x_width = x.shape[1]
out_lod = [[]]
for i in xrange(len(x_lod[0])):
for i in range(len(x_lod[0])):
seq_len = x_lod[0][i]
offset = (seq_len * x_width) / dimension
assert int(offset) * dimension == seq_len * x_width
......
......@@ -48,7 +48,7 @@ class TestShrinkRNNMemoryBase(unittest.TestCase):
def sum_lodtensor(self, tensor):
sum_res = 0.0
for i in xrange(np.product(tensor.shape())):
for i in range(np.product(tensor.shape())):
sum_res += tensor._get_float_element(i)
return sum_res
......
......@@ -26,7 +26,7 @@ class TestSplitOp(OpTest):
self.inputs = {'X': x}
self.attrs = {'axis': axis, 'sections': [2, 1, 2]}
self.outputs = {'Out': [('out%d' % i, out[i]) \
for i in xrange(len(out))]}
for i in range(len(out))]}
def _set_op_type(self):
self.op_type = "split"
......
......@@ -53,7 +53,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
height_sections = [5, 5, 5, 5, 3]
# initialize output variables [out0, out1]
outs_name = ["out%d" % i for i in xrange(len(height_sections))]
outs_name = ["out%d" % i for i in range(len(height_sections))]
outs = [
scope.var(var_name).get_selected_rows() for var_name in outs_name
]
......
......@@ -26,7 +26,7 @@ class TestSppOp(OpTest):
input = np.random.random(self.shape).astype("float32")
nsize, csize, hsize, wsize = input.shape
out_level_flatten = []
for i in xrange(self.pyramid_height):
for i in range(self.pyramid_height):
bins = np.power(2, i)
kernel_size = [0, 0]
padding = [0, 0]
......
......@@ -28,7 +28,7 @@ class TestTopkOp(OpTest):
self.inputs = {'X': input}
self.attrs = {'k': k}
for rowid in xrange(32):
for rowid in range(32):
row = input[rowid]
output[rowid] = np.sort(row)[-k:]
indices[rowid] = row.argsort()[-k:]
......@@ -52,7 +52,7 @@ class TestTopkOp3d(OpTest):
self.inputs = {'X': input_flat_2d}
self.attrs = {'k': k}
for rowid in xrange(64):
for rowid in range(64):
row = input_flat_2d[rowid]
output[rowid] = np.sort(row)[-k:]
indices[rowid] = row.argsort()[-k:]
......
......@@ -22,10 +22,10 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
out = np.zeros((s0, s1, out_hsize, out_wsize))
for nidx in xrange(s0):
for cidx in xrange(s1):
for h in xrange(s2):
for w in xrange(s3):
for nidx in range(s0):
for cidx in range(s1):
for h in range(s2):
for w in range(s3):
index = indices[nidx, cidx, h, w]
hidx = (index - index % out_wsize) / out_wsize
widx = index % out_wsize
......@@ -47,16 +47,16 @@ class TestUnpoolOp(OpTest):
self.strides[1] + 1
input = np.zeros((nsize, csize, hsize_out, wsize_out))
indices = np.zeros((nsize, csize, hsize_out, wsize_out))
for i in xrange(hsize_out):
for j in xrange(wsize_out):
for i in range(hsize_out):
for j in range(wsize_out):
r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
r_end = np.min((i * self.strides[0] + self.ksize[0] - \
self.paddings[0], hsize))
c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
c_end = np.min((j * self.strides[1] + self.ksize[1] - \
self.paddings[1], wsize))
for nidx in xrange(nsize):
for cidx in xrange(csize):
for nidx in range(nsize):
for cidx in range(csize):
x_masked = pre_input[nidx, cidx, r_start:r_end, \
c_start:c_end]
input[nidx, cidx, i, j] = x_masked.max()
......
......@@ -66,7 +66,7 @@ class TestWhileOp(unittest.TestCase):
exe = Executor(cpu)
d = []
for i in xrange(3):
for i in range(3):
d.append(numpy.random.random(size=[10]).astype('float32'))
outs = exe.run(feed={'d0': d[0],
......
......@@ -150,7 +150,7 @@ def append_input_output(block, op_proto, np_list, is_input, dtype):
def append_loss_ops(block, output_names):
mean_inputs = map(block.var, output_names)
mean_inputs = list(map(block.var, output_names))
# for item in mean_inputs:
# print(item)
# print("Item", item.dtype)
......
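A small sketch (not from this PR) of why 2to3 wraps `map()` in `list()` here: on Python 3, `map()` returns a one-shot iterator, so any code that indexes or re-iterates the result needs an explicit list.

```python
# On Python 3 map() is lazy; materialize it before indexing or reusing.
mean_inputs = map(str.strip, ['  loss ', ' acc '])
mean_inputs = list(mean_inputs)  # without this, mean_inputs[0] raises TypeError on Py3
print(mean_inputs[0], len(mean_inputs))  # loss 2
```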
......@@ -118,8 +118,9 @@ def multi_head_attention(queries,
# FIXME(guosheng): Decouple the program desc with batch_size.
return layers.reshape(
x=trans_x,
shape=map(int,
[batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
shape=list(
map(int, [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]
])))
def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
"""
......
......@@ -18,16 +18,15 @@ import errno
import shutil
import time
import core
import data_feeder
import executor
import framework
import io
from . import core
from . import data_feeder
from . import executor
from . import framework
from . import io
# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
import optimizer as opt_module
import parallel_executor
from transpiler import distribute_transpiler
from . import optimizer as opt_module
from . import parallel_executor
from .transpiler import distribute_transpiler
__all__ = [
'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
......@@ -73,7 +72,7 @@ class BeginStepEvent(object):
self.step = step_id
self.fetch_metrics = True
"""
If fetch_metrics is true, the metrics will be fetched at the
If fetch_metrics is true, the metrics will be fetched at the
EndStepEvent. Default is True.
"""
......@@ -614,11 +613,12 @@ def build_feed_var_list(program, feed_order):
if not isinstance(feed_order, dict):
raise TypeError(
"The 'feed_order' should be either None, list or dict.")
if not sorted(feed_order.values()) == range(len(feed_order)):
if not sorted(feed_order.values()) == list(range(len(feed_order))):
raise ValueError(
"The values of 'feed_order' should be a permutation of [0, len(feed_order))"
)
sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1])
sorted_pair_list = sorted(
list(feed_order.items()), key=lambda item: item[1])
feed_var_list = [
program.global_block().var(pair[0]) for pair in sorted_pair_list
]
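The `feed_order` check above changes because `range()` no longer returns a list on Python 3, so comparing it against a sorted list is always False unless it is materialized first. A hypothetical mini-check:

```python
# sorted(...) yields a list; range(...) must become a list to compare equal.
feed_order = {'x': 0, 'y': 2, 'z': 1}
assert sorted(feed_order.values()) == list(range(len(feed_order)))
print(sorted(feed_order.items(), key=lambda item: item[1]))
# [('x', 0), ('z', 1), ('y', 2)]
```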
......@@ -644,14 +644,14 @@ def save_checkpoint(executor,
pserver_endpoints=None):
"""
This function filters out all checkpoint variables from the given
main_program and then saves these variables to the `checkpoint_dir`
main_program and then saves these variables to the `checkpoint_dir`
directory.
In the training process, we generally save a checkpoint in each
iteration. So there might be a lot of checkpoints in the
`checkpoint_dir`. To avoid them taking too much disk space, the
`max_num_checkpoints` argument is introduced to limit the total number of
checkpoints. If the number of existing checkpoints is greater than
iteration. So there might be a lot of checkpoints in the
`checkpoint_dir`. To avoid them taking too much disk space, the
`max_num_checkpoints` argument is introduced to limit the total number of
checkpoints. If the number of existing checkpoints is greater than
the `max_num_checkpoints`, the oldest ones will be scroll-deleted.
A variable is a checkpoint variable and will be saved if it meets
......@@ -663,21 +663,21 @@ def save_checkpoint(executor,
Args:
executor(Executor): The executor to run for save checkpoint.
checkpoint_dir(str): The folder where to save checkpoints.
trainer_id(int): current trainer id; if the id is equal to 0, the trainer
trainer_id(int): current trainer id; if the id is equal to 0, the trainer
is chief.
trainer_args(dict|None): Current training arguments, such as 'epoch_id'
trainer_args(dict|None): Current training arguments, such as 'epoch_id'
and 'step_id'.
Default: None
main_program(Program): The program whose checkpoint variables will
be saved.
max_num_checkpoints(int): The maximum number of existing
max_num_checkpoints(int): The maximum number of existing
checkpoints.
Default: 3
lookup_table(string|None): the lookup table name; when using a distributed
lookup table, we can get the lookup table name from DistributeTranspiler.
table_name
pserver_endpoints(list|None): the parameter server ip:port list;
when using a distributed lookup table, we can get pserver_endpoints from the
table_name
pserver_endpoints(list|None): the parameter server ip:port list;
when using a distributed lookup table, we can get pserver_endpoints from the
distribute arguments.
Returns:
......@@ -747,8 +747,8 @@ def load_checkpoint(executor,
`checkpoint_dir` directory.
In the training process, we generally save a checkpoint in each
iteration, so there may be more than one checkpoint in the
`checkpoint_dir` (each checkpoint has its own sub-folder); use
iteration, so there may be more than one checkpoint in the
`checkpoint_dir` (each checkpoint has its own sub-folder); use
`serial` to specify which serial of checkpoint you would like to
load.
......@@ -819,9 +819,9 @@ def load_checkpoint(executor,
def clean_checkpoint(checkpoint_dir, delete_dir=False):
"""
Clean the checkpoint dir; when the training exits normally,
Clean the checkpoint dir; when the training exits normally,
the trainer will call clean_checkpoint to delete the checkpoint directory saved before.
delete_dir only works when the directory is empty; otherwise, OSError is raised.
delete_dir only works when the directory is empty; otherwise, OSError is raised.
: param checkpoint_dir
: param delete_dir
......@@ -889,7 +889,7 @@ def _load_persist_vars_without_grad(executor,
def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
"""
The parameter server will load the lookup table's local file into a
The parameter server will load the lookup table's local file into a
selectedrows variable.
Args:
......@@ -940,7 +940,7 @@ def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
def _save_persist_vars_without_grad(executor, dirname, program):
"""
This function filters out all checkpoint variables from the given
program and then saves these variables to a sub-folder '__model__' of
program and then saves these variables to a sub-folder '__model__' of
the given directory.
A variable is a checkpoint variable if it meets all following
......@@ -969,7 +969,7 @@ def _save_persist_vars_without_grad(executor, dirname, program):
# In this example, `_save_persist_vars_without_grad` function
# will first filters out all checkpoint variables in the default
# main program, and then saves these variables to the folder
# main program, and then saves these variables to the folder
# "./my_paddle_model/__model__".
"""
cur_dir = _get_model_dir(dirname)
......@@ -988,7 +988,7 @@ def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
"""
This function will send a checkpoint notify message from Trainer 0
to all the pservers.
The checkpoint notify message contains the lookup table name and
The checkpoint notify message contains the lookup table name and
the absolute path on the pserver where the lookup_table will be saved.
Args:
......@@ -996,13 +996,13 @@ def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
dirname(str): The folder where to save checkpoints.
lookup_table(string): the lookup table name; when using a distributed
lookup table, we can get the lookup table name from DistributeTranspiler.
table_name
ps_endpoint_list(list): the parameter server ip:port list;
when using a distributed lookup table, we can get ps_endpoint_list from the
table_name
ps_endpoint_list(list): the parameter server ip:port list;
when using a distributed lookup table, we can get ps_endpoint_list from the
distribute arguments.
Return:
None
Examples:
.. code-block:: python
......@@ -1013,7 +1013,7 @@ def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
_save_pserver_vars_by_notify(executor=exe,
dirname=param_path, lookup_table=table_name,
dirname=param_path, lookup_table=table_name,
ps_endpoint_list=ps_endpoints)
"""
cur_dir = _get_lookuptable_dir(dirname)
......@@ -1036,7 +1036,7 @@ def _save_trainer_args(dirname, trainer_id, trainer_args):
cur_dir = _get_trainer_dir(dirname, trainer_id)
for name, value in trainer_args.iteritems():
for name, value in list(trainer_args.items()):
args_file = os.path.join(cur_dir, name)
with open(args_file, 'w') as f:
f.write(str(value))
......@@ -1045,7 +1045,7 @@ def _save_trainer_args(dirname, trainer_id, trainer_args):
def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
"""
trainer will load some args from its independent directory,
trainer will load some args from its independent directory,
such as epoch_id and step_id.
Args:
......@@ -1168,10 +1168,10 @@ def _scroll_delete(dirname, max_num_checkpoints=3):
serial_num = _get_dir_serial(serial)
serial_map[serial_num] = serial
if len(serial_map.keys()) <= max_num_checkpoints:
if len(list(serial_map.keys())) <= max_num_checkpoints:
return
serials = serial_map.keys()
serials = list(serial_map.keys())
serials.sort(reverse=True)
serials = serials[max_num_checkpoints:]
for serial in serials:
......
......@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
from inference_transpiler import InferenceTranspiler
from memory_optimization_transpiler import memory_optimize, release_memory
from ps_dispatcher import HashName, RoundRobin
from .distribute_transpiler import DistributeTranspiler, DistributeTranspilerConfig
from .inference_transpiler import InferenceTranspiler
from .memory_optimization_transpiler import memory_optimize, release_memory
from .ps_dispatcher import HashName, RoundRobin
__all__ = [
"DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
......
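For context, an illustrative (hypothetical) package layout for these import rewrites: Python 3 removed implicit relative imports, so a module inside a package must name its siblings with an explicit leading dot.

```python
# transpiler_pkg/__init__.py  (hypothetical package, mirroring the change above)
from .ps_dispatcher import HashName, RoundRobin  # explicit relative import: Py2 and Py3
# "from ps_dispatcher import ..." only worked under Python 2's implicit
# relative-import rules and fails with ImportError on Python 3.
```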
......@@ -12,5 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from program_utils import *
from ufind import *
from .program_utils import *
from .ufind import *
......@@ -17,8 +17,8 @@ def delete_ops(block, ops):
try:
start = list(block.ops).index(ops[0])
end = list(block.ops).index(ops[-1])
[block._remove_op(start) for _ in xrange(end - start + 1)]
except Exception, e:
[block._remove_op(start) for _ in range(end - start + 1)]
except Exception as e:
raise e
block.program._sync_with_cpp()
......
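A tiny sketch (not from this PR) of the exception-syntax change just above: the comma form `except Exception, e` does not even parse on Python 3, while the `as` form is accepted by Python 2.6+ and 3.

```python
try:
    raise ValueError("op not found in block")
except Exception as e:  # "except Exception, e:" is a SyntaxError on Python 3
    print("caught:", e)
```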
......@@ -28,18 +28,17 @@ Steps to transpile pserver:
5. add listen_and_serv op
"""
from __future__ import print_function
import math
import random
import numpy as np
from ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .ps_dispatcher import RoundRobin, HashName, PSDispatcher
from .. import core, framework
from ..framework import Program, default_main_program, \
default_startup_program, Block, \
Parameter, grad_var_name
from details import *
from .details import *
from functools import reduce
LOOKUP_TABLE_TYPE = "lookup_table"
LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
......@@ -102,7 +101,7 @@ def slice_variable(var_list, slice_count, min_block_size):
block_size += dim1 - remains
# update split_count after aligning
split_count = int(math.ceil(var_numel / float(block_size)))
for block_id in xrange(split_count):
for block_id in range(split_count):
curr_block_size = min(block_size, var_numel - (
(block_id) * block_size))
block = VarBlock(var.name, block_id, curr_block_size)
......@@ -117,7 +116,7 @@ class DistributeTranspilerConfig(object):
try to choose the best method to balance loads for pservers.
min_block_size (int): Minimum number of split elements in a block.
According to: https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
We can use bandwidth efficiently when the data size is larger than 2MB. If you
We can use bandwidth efficiently when the data size is larger than 2MB. If you
want to change it, please make sure you understand the slice_variable function.
"""
......@@ -218,7 +217,7 @@ class DistributeTranspiler(object):
# fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
# fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
# shuffle the map will avoid the uneven distribution above
grad_var_mapping_items = self.grad_var_mapping.items()
grad_var_mapping_items = list(self.grad_var_mapping.items())
if not self.config.slice_var_up:
random.seed(self.trainer_num)
random.shuffle(grad_var_mapping_items)
......@@ -278,7 +277,7 @@ class DistributeTranspiler(object):
self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
# step4: Concat the parameters splits together after recv.
for varname, splited_var in self.param_var_mapping.iteritems():
for varname, splited_var in list(self.param_var_mapping.items()):
eps = []
for var in splited_var:
index = [v.name for v in recv_vars].index(var.name)
......@@ -303,7 +302,7 @@ class DistributeTranspiler(object):
RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
})
for varname, splited_var in self.param_var_mapping.iteritems():
for varname, splited_var in list(self.param_var_mapping.items()):
if len(splited_var) <= 1:
continue
orig_param = program.global_block().vars[varname]
......@@ -373,7 +372,7 @@ class DistributeTranspiler(object):
dtype=v.dtype,
shape=v.shape)
if self.sync_mode and self.trainer_num > 1:
for trainer_id in xrange(self.trainer_num):
for trainer_id in range(self.trainer_num):
var = pserver_program.global_block().create_var(
name="%s.trainer_%d" % (orig_var_name, trainer_id),
persistable=False,
......@@ -463,7 +462,7 @@ class DistributeTranspiler(object):
per_opt_block = pserver_program.create_block(pre_block_idx)
optimize_blocks.append(per_opt_block)
# append grad merging ops before clip and weight decay
# cases may look like:
# cases may look like:
# L2Decay op -> clip op -> optimize
for _, op in enumerate(self.optimize_ops):
# find the origin @GRAD var before clipping
......@@ -560,7 +559,7 @@ class DistributeTranspiler(object):
# 1. create vars in pserver program to startup program
pserver_vars = pserver_program.global_block().vars
created_var_map = dict()
for _, var in pserver_vars.iteritems():
for _, var in list(pserver_vars.items()):
tmpvar = s_prog.global_block()._clone_variable(var)
created_var_map[var.name] = tmpvar
......@@ -992,11 +991,11 @@ class DistributeTranspiler(object):
var_mapping = dict()
for block_str in block_list:
varname, offset, size = block_str.split(":")
if not block_map.has_key(varname):
if varname not in block_map:
block_map[varname] = []
block_map[varname].append((long(offset), long(size)))
block_map[varname].append((int(offset), int(size)))
for varname, splited in block_map.iteritems():
for varname, splited in list(block_map.items()):
orig_var = program.global_block().var(varname)
if len(splited) == 1:
if self.sync_mode and add_trainer_suffix:
......@@ -1159,7 +1158,7 @@ class DistributeTranspiler(object):
grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
if self.sync_mode and self.trainer_num > 1:
vars2merge = []
for i in xrange(self.trainer_num):
for i in range(self.trainer_num):
per_trainer_name = "%s.trainer_%d" % \
(merged_var_name, i)
vars2merge.append(pserver_block.vars[per_trainer_name])
......@@ -1207,7 +1206,7 @@ class DistributeTranspiler(object):
# learning rate variable has already be created by non-optimize op,
# don't create it once again.
lr_varname = opt_op.input(key)[0]
if pserver_block.vars.has_key(lr_varname):
if lr_varname in pserver_block.vars:
new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
else:
origin_var = origin_program.global_block().vars[lr_varname]
......@@ -1247,7 +1246,9 @@ class DistributeTranspiler(object):
def _is_splited_grad_var(self, var, var_dict):
grad_block = None
for _, g in var_dict.iteritems():
# TODO(minqiyang): replace these items() with six.iteritems() to
# improve memory
for _, g in list(var_dict.items()):
if self._orig_varname(g.name) == self._orig_varname(var.name):
if g.name.find(".trainer_") == -1:
grad_block = g
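A sketch of the alternative the TODO above refers to: `six.iteritems()` maps to `dict.iteritems()` on Python 2 and `dict.items()` on Python 3, so a large dict can be walked lazily without the `list()` copy.

```python
import six

var_dict = {'fc_w@GRAD': 1, 'fc_b@GRAD': 2}
for name, g in six.iteritems(var_dict):  # no intermediate list on either version
    print(name, g)
```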
......@@ -1257,7 +1258,7 @@ class DistributeTranspiler(object):
def _clone_lr_op(self, program, block, op):
inputs = self._get_input_map_from_op(
self.origin_program.global_block().vars, op)
for key, varlist in inputs.iteritems():
for key, varlist in list(inputs.items()):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
......@@ -1266,7 +1267,7 @@ class DistributeTranspiler(object):
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, op)
for key, varlist in outputs.iteritems():
for key, varlist in list(outputs.items()):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
......@@ -1281,7 +1282,7 @@ class DistributeTranspiler(object):
# Append the ops for parameters that do not need to be optimized/updated
inputs = self._get_input_map_from_op(
self.origin_program.global_block().vars, opt_op)
for key, varlist in inputs.iteritems():
for key, varlist in list(inputs.items()):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
......@@ -1291,7 +1292,7 @@ class DistributeTranspiler(object):
var, program.global_block().vars)
if grad_block:
inputs[key] = grad_block
elif not program.global_block().vars.has_key(var.name):
elif var.name not in program.global_block().vars:
program.global_block().create_var(
name=var.name,
persistable=var.persistable,
......@@ -1300,7 +1301,7 @@ class DistributeTranspiler(object):
outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, opt_op)
for key, varlist in outputs.iteritems():
for key, varlist in list(outputs.items()):
if not isinstance(varlist, list):
varlist = [varlist]
for var in varlist:
......@@ -1308,7 +1309,7 @@ class DistributeTranspiler(object):
var, program.global_block().vars)
if grad_block:
outputs[key] = grad_block
elif not program.global_block().vars.has_key(var.name):
elif var.name not in program.global_block().vars:
program.global_block()._clone_variable(var)
return optimize_block.append_op(
......@@ -1329,8 +1330,8 @@ class DistributeTranspiler(object):
def _create_ufind(self, optimize_ops):
# Create a unit find data struct by optimize ops
ufind = UnionFind(optimize_ops)
for i in xrange(len(optimize_ops)):
for j in xrange(i, len(optimize_ops)):
for i in range(len(optimize_ops)):
for j in range(i, len(optimize_ops)):
op1 = optimize_ops[i]
op2 = optimize_ops[j]
if self._is_op_connected(op1, op2):
......
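For context, a hypothetical mini-example of the dict and integer changes in the hunks above: `dict.has_key()` and the `long` type are gone in Python 3; `in` and `int` (arbitrary-precision on Python 3) are the portable spellings.

```python
block_map = {}
varname, offset, size = "fc_w:0:4096".split(":")
if varname not in block_map:  # block_map.has_key(varname) raises AttributeError on Py3
    block_map[varname] = []
block_map[varname].append((int(offset), int(size)))  # long() no longer exists on Py3
print(block_map)  # {'fc_w': [(0, 4096)]}
```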
......@@ -305,6 +305,6 @@ class InferenceTranspiler(object):
args += current_op.output_arg_names
args = list(set(args)) # unique the input and output arguments
for var in self.block.vars.keys():
for var in list(self.block.vars.keys()):
if var not in args:
self.block._remove_var(var)
......@@ -16,6 +16,8 @@ from collections import defaultdict
from .. import core
from ..framework import Program, default_main_program, Parameter
from ..backward import _rename_arg_
from functools import reduce
from six.moves import range
dtype_to_size = {
core.VarDesc.VarType.FP16: 2,
......@@ -107,7 +109,7 @@ class ControlFlowGraph(object):
# Repeatedly apply liveness updates until the algorithm stabilizes
# on a complete set of live input vars and live output vars.
while True:
for i in reversed(range(self.op_size)):
for i in reversed(list(range(self.op_size))):
live_in[i] = set(self._live_in[i])
live_out[i] = set(self._live_out[i])
for s in self._successors[i]:
......@@ -172,9 +174,10 @@ class ControlFlowGraph(object):
is_forward = i < self._forward_num
in_diff, out_diff = self._get_diff(self._live_in[i],
self._live_out[i])
can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff)
can_optimize = [
x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward)
]
if can_optimize:
index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
delete_op = block_desc._insert_op(index)
......@@ -213,9 +216,10 @@ class ControlFlowGraph(object):
block_desc = op.block()
is_forward = i < self._forward_num
if self.pool:
defs_can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward),
self._defs[i])
defs_can_optimize = [
x for x in self._defs[i]
if self._check_var_validity(block_desc, x, is_forward)
]
out_pair = [
(x, self._find_var(block_desc, x, is_forward).shape())
for x in defs_can_optimize
......@@ -261,9 +265,10 @@ class ControlFlowGraph(object):
break
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
can_optimize = filter(
lambda x: self._check_var_validity(block_desc, x, is_forward),
in_diff)
can_optimize = [
x for x in in_diff
if self._check_var_validity(block_desc, x, is_forward)
]
if can_optimize:
for var_name in can_optimize:
self.pool.append((var_name, self._find_var(
......
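A small sketch (hypothetical variable names) of why these `filter()` calls become list comprehensions: on Python 3, `filter()` returns a lazy object that is always truthy, so `if can_optimize:` would succeed even for an empty result.

```python
in_diff = ['tmp_0', 'fc_0.w', 'tmp_1']
can_optimize = [x for x in in_diff if x.startswith('tmp')]
if can_optimize:         # a list is falsy when empty; a filter object never is
    print(can_optimize)  # ['tmp_0', 'tmp_1']
```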
......@@ -14,6 +14,7 @@
import collections
import contextlib
import six
import sys
__all__ = ['generate', 'switch', 'guard']
......@@ -67,8 +68,10 @@ def switch(new_generator=None):
@contextlib.contextmanager
def guard(new_generator=None):
if isinstance(new_generator, basestring):
if isinstance(new_generator, six.string_types):
new_generator = UniqueNameGenerator(new_generator)
elif isinstance(new_generator, six.binary_type):
new_generator = UniqueNameGenerator(new_generator.decode())
old = switch(new_generator)
yield
switch(old)
......@@ -67,11 +67,14 @@ def recordio(paths, buf_size=100):
import recordio as rec
import paddle.reader.decorator as dec
import cPickle as pickle
import six
import six.moves.cPickle as pickle
def reader():
if isinstance(paths, basestring):
if isinstance(paths, six.string_types):
path = paths
elif isinstance(paths, six.binary_type):
path = paths.decode()
else:
path = ",".join(paths)
f = rec.reader(path)
......
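A minimal sketch of the string-type check above: `basestring` is gone on Python 3, so `six.string_types` covers text and bytes are decoded explicitly.

```python
import six

def normalize(paths):  # mirrors the recordio() path handling above
    if isinstance(paths, six.string_types):
        return paths
    elif isinstance(paths, six.binary_type):
        return paths.decode()
    return ",".join(paths)

print(normalize(b"data.recordio"))    # data.recordio
print(normalize(["a.rio", "b.rio"]))  # a.rio,b.rio
```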
......@@ -21,6 +21,9 @@ from threading import Thread
import subprocess
from six.moves.queue import Queue
from six.moves import zip_longest
from six.moves import map
from six.moves import zip
import itertools
import random
import zlib
......@@ -42,7 +45,7 @@ def map_readers(func, *readers):
rs = []
for r in readers:
rs.append(r())
for e in itertools.imap(func, *rs):
for e in map(func, *rs):
yield e
return reader
......@@ -148,16 +151,16 @@ def compose(*readers, **kwargs):
for r in readers:
rs.append(r())
if not check_alignment:
for outputs in itertools.izip(*rs):
yield sum(map(make_tuple, outputs), ())
for outputs in zip(*rs):
yield sum(list(map(make_tuple, outputs)), ())
else:
for outputs in itertools.izip_longest(*rs):
for outputs in zip_longest(*rs):
for o in outputs:
if o is None:
# None will be not be present if compose is aligned
raise ComposeNotAligned(
"outputs of readers are not aligned.")
yield sum(map(make_tuple, outputs), ())
yield sum(list(map(make_tuple, outputs)), ())
return reader
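For context, a tiny sketch of the iterator renames used here: `itertools.izip`/`izip_longest` were removed in Python 3, and `six.moves` exposes the lazy equivalents under the Python 3 names on both interpreters.

```python
from six.moves import zip, zip_longest

a, b = [1, 2, 3], ['x', 'y']
print(list(zip(a, b)))          # [(1, 'x'), (2, 'y')]
print(list(zip_longest(a, b)))  # [(1, 'x'), (2, 'y'), (3, None)]
```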
......@@ -306,7 +309,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
args = (in_queue, out_queue, mapper, out_order) if order else (
in_queue, out_queue, mapper)
workers = []
for i in xrange(process_num):
for i in range(process_num):
worker = Thread(target=target, args=args)
worker.daemon = True
workers.append(worker)
......
......@@ -136,7 +136,7 @@ class TestXmap(unittest.TestCase):
reader = paddle.reader.xmap_readers(mapper,
reader_creator_10(0),
tNum, size, order)
for n in xrange(3):
for n in range(3):
result = []
for i in reader():
result.append(i)
......@@ -156,7 +156,7 @@ class TestPipeReader(unittest.TestCase):
import tempfile
records = [str(i) for i in xrange(5)]
records = [str(i) for i in range(5)]
temp = tempfile.NamedTemporaryFile()
try:
with open(temp.name, 'w') as f:
......
......@@ -42,7 +42,7 @@ except ImportError:
try:
import cPickle as pickle
except ImportError:
import pickle
import six.moves.cPickle as pickle
import io
......
......@@ -20,7 +20,7 @@ from .utils import deprecated
try:
import cPickle as pickle
except ImportError:
import pickle
import six.moves.cPickle as pickle
__all__ = ['define_py_data_sources2']
......
......@@ -28,7 +28,7 @@ from .default_decorators import *
try:
import cPickle as pickle
except ImportError:
import pickle
import six.moves.cPickle as pickle
import copy
__all__ = [
......
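A minimal sketch of the pickle change repeated in these hunks: `cPickle` was folded into `pickle` on Python 3, and `six.moves.cPickle` picks the accelerated module on Python 2 and the standard one on Python 3.

```python
import six.moves.cPickle as pickle

blob = pickle.dumps({'epoch_id': 3, 'step_id': 42})
print(pickle.loads(blob))  # {'epoch_id': 3, 'step_id': 42}
```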
......@@ -12,19 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import os
import sys
import paddle.fluid as fluid
import importlib
import cStringIO
from six.moves import cStringIO
def main():
sys.path.append(os.getcwd())
some_test_failed = False
for module_name in sys.argv[1:]:
buffer = cStringIO.StringIO()
buffer = cStringIO()
main = fluid.Program()
startup = fluid.Program()
scope = fluid.core.Scope()
......@@ -37,8 +38,11 @@ def main():
res = unittest.TextTestRunner(stream=buffer).run(tests)
if not res.wasSuccessful():
some_test_failed = True
print >> sys.stderr, module_name, 'failed\n', buffer.getvalue(
)
print(
module_name,
'failed\n',
buffer.getvalue(),
file=sys.stderr)
if some_test_failed:
exit(1)
......
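A closing sketch (hypothetical values) of the last two conversions: the `print >> stream` redirection syntax is Python 2 only, and `cStringIO` becomes an in-memory text stream importable from `six.moves`.

```python
from __future__ import print_function
import sys
from six.moves import cStringIO

buf = cStringIO()  # io.StringIO on Py3, cStringIO.StringIO on Py2
buf.write('test_foo ... ok\n')
print('some_module', 'failed\n', buf.getvalue(), file=sys.stderr)
```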