Merge pull request #12390 from velconia/port_python3_syntax

Apply 2to3 to current paddle main python code

Merge pull request #12390 from velconia/port_python3_syntax
Apply 2to3 to current paddle main python code
29fac3c0 · Qiyang Min · GitHub · 5a9ae411 · 39059eb7 · 29fac3c0
170 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VER
 if(NOT PY_VERSION)
  set(PY_VERSION 2.7)
 endif()
+set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -394,8 +394,10 @@ All parameter, weight, gradient are variables in Paddle.
    InferenceOptimize(*(origin.Proto()), &pruned_desc);
    return new ProgramDesc(pruned_desc);
  });
-  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
+  m.def("empty_var_name",
-  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
+        []() { return std::string(framework::kEmptyVarName); });
+  m.def("grad_var_suffix",
+        []() { return std::string(framework::kGradVarSuffix); });
  m.def_submodule(
       "var_names",
       "The module will return special predefined variable name in Paddle")

--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -28,11 +28,12 @@ images per class.
 """
-import cPickle
 import itertools
 import numpy
 import paddle.dataset.common
 import tarfile
+from six.moves import zip
+from six.moves import cPickle as pickle
 __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
@@ -48,7 +49,7 @@ def reader_creator(filename, sub_name, cycle=False):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
-        for sample, label in itertools.izip(data, labels):
+        for sample, label in zip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)
    def reader():
@@ -58,7 +59,7 @@ def reader_creator(filename, sub_name, cycle=False):
            while True:
                for name in names:
-                    batch = cPickle.load(f.extractfile(name))
+                    batch = pickle.load(f.extractfile(name))
                    for item in read_batch(batch):
                        yield item
                if not cycle:

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -20,9 +20,8 @@ import shutil
 import sys
 import importlib
 import paddle.dataset
-import cPickle
+import six.moves.cPickle as pickle
 import glob
-import cPickle as pickle
 __all__ = [
    'DATA_HOME',
@@ -75,13 +74,13 @@ def download(url, module_name, md5sum, save_name=None):
    retry_limit = 3
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if os.path.exists(filename):
-            print "file md5", md5file(filename), md5sum
+            print("file md5", md5file(filename), md5sum)
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download {0} within retry limit {1}".
                               format(url, retry_limit))
-        print "Cache file %s not found, downloading %s" % (filename, url)
+        print("Cache file %s not found, downloading %s" % (filename, url))
        r = requests.get(url, stream=True)
        total_length = r.headers.get('content-length')
@@ -104,8 +103,9 @@ def download(url, module_name, md5sum, save_name=None):
 def fetch_all():
-    for module_name in filter(lambda x: not x.startswith("__"),
+    for module_name in [
-                              dir(paddle.dataset)):
+            x for x in dir(paddle.dataset) if not x.startswith("__")
+    ]:
        if "fetch" in dir(
                importlib.import_module("paddle.dataset.%s" % module_name)):
            getattr(
@@ -114,8 +114,9 @@ def fetch_all():
 def fetch_all_recordio(path):
-    for module_name in filter(lambda x: not x.startswith("__"),
+    for module_name in [
-                              dir(paddle.dataset)):
+            x for x in dir(paddle.dataset) if not x.startswith("__")
+    ]:
        if "convert" in dir(
                importlib.import_module("paddle.dataset.%s" % module_name)) and \
                not module_name == "common":
@@ -126,7 +127,7 @@ def fetch_all_recordio(path):
                "convert")(ds_path)
-def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+def split(reader, line_count, suffix="%05d.pickle", dumper=pickle.dump):
    """
    you can call the function as:
@@ -167,7 +168,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
 def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
-                         loader=cPickle.load):
+                         loader=pickle.load):
    """
    Create a reader that yield element from the given files, select
    a file set according trainer count and trainer_id
@@ -188,7 +189,7 @@ def cluster_files_reader(files_pattern,
        my_file_list = []
        for idx, fn in enumerate(file_list):
            if idx % trainer_count == trainer_id:
-                print "append file: %s" % fn
+                print("append file: %s" % fn)
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
@@ -221,7 +222,7 @@ def convert(output_path, reader, line_count, name_prefix):
        for l in lines:
            # FIXME(Yancey1989):
            # dumps with protocol: pickle.HIGHEST_PROTOCOL
-            writer.write(cPickle.dumps(l))
+            writer.write(pickle.dumps(l))
        writer.close()
    lines = []

--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -24,6 +24,7 @@ import tarfile
 import gzip
 import itertools
 import paddle.dataset.common
+from six.moves import zip
 __all__ = ['test, get_dict', 'get_embedding', 'convert']
@@ -87,12 +88,12 @@ def corpus_reader(data_path, words_name, props_name):
            sentences = []
            labels = []
            one_seg = []
-            for word, label in itertools.izip(words_file, props_file):
+            for word, label in zip(words_file, props_file):
                word = word.strip()
                label = label.strip().split()
                if len(label) == 0:  # end of sentence
-                    for i in xrange(len(one_seg[0])):
+                    for i in range(len(one_seg[0])):
                        a_kind_lable = [x[i] for x in one_seg]
                        labels.append(a_kind_lable)

--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -28,10 +28,9 @@ Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 """
-import cPickle
 import itertools
 import functools
-from common import download
+from .common import download
 import tarfile
 import scipy.io as scio
 from paddle.dataset.image import *
@@ -39,6 +38,8 @@ from paddle.reader import *
 import os
 import numpy as np
 from multiprocessing import cpu_count
+from six.moves import cPickle as pickle
+from six.moves import zip
 __all__ = ['train', 'test', 'valid']
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
@@ -116,10 +117,10 @@ def reader_creator(data_file,
                file = file.strip()
                batch = None
                with open(file, 'r') as f:
-                    batch = cPickle.load(f)
+                    batch = pickle.load(f)
                data = batch['data']
                labels = batch['label']
-                for sample, label in itertools.izip(data, batch['label']):
+                for sample, label in zip(data, batch['label']):
                    yield sample, int(label) - 1
            if not cycle:
                break

--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -36,7 +36,7 @@ except ImportError:
    cv2 = None
 import os
 import tarfile
-import cPickle
+import six.moves.cPickle as pickle
 __all__ = [
    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
@@ -86,10 +86,10 @@ def batch_images_from_tar(data_file,
                output = {}
                output['label'] = labels
                output['data'] = data
-                cPickle.dump(
+                pickle.dump(
                    output,
                    open('%s/batch_%d' % (out_path, file_id), 'w'),
-                    protocol=cPickle.HIGHEST_PROTOCOL)
+                    protocol=pickle.HIGHEST_PROTOCOL)
                file_id += 1
                data = []
                labels = []
@@ -97,10 +97,10 @@ def batch_images_from_tar(data_file,
        output = {}
        output['label'] = labels
        output['data'] = data
-        cPickle.dump(
+        pickle.dump(
            output,
            open('%s/batch_%d' % (out_path, file_id), 'w'),
-            protocol=cPickle.HIGHEST_PROTOCOL)
+            protocol=pickle.HIGHEST_PROTOCOL)
    with open(meta_file, 'a') as meta:
        for file in os.listdir(out_path):

--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -42,13 +42,13 @@ def tokenize(pattern):
        # sequential access of member files, other than
        # tarfile.extractfile, which does random access and might
        # destroy hard disks.
-        tf = tarf.next()
+        tf = next(tarf)
        while tf != None:
            if bool(pattern.match(tf.name)):
                # newline and punctuations removal and ad-hoc tokenization.
                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
                    None, string.punctuation).lower().split()
-            tf = tarf.next()
+            tf = next(tarf)
 def build_dict(pattern, cutoff):
@@ -62,11 +62,11 @@ def build_dict(pattern, cutoff):
            word_freq[word] += 1
    # Not sure if we should prune less-frequent words here.
-    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+    word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*dictionary))
-    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx = dict(list(zip(words, list(range(len(words))))))
    word_idx['<unk>'] = len(words)
    return word_idx

--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -64,11 +64,11 @@ def build_dict(min_word_freq=50):
            # remove <unk> for now, since we will set it as last index
            del word_freq['<unk>']
-        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+        word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx = dict(list(zip(words, list(range(len(words))))))
        word_idx['<unk>'] = len(words)
    return word_idx

--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -65,7 +65,7 @@ def reader_creator(image_filename, label_filename, buffer_size):
                images = images / 255.0 * 2.0 - 1.0
-                for i in xrange(buffer_size):
+                for i in range(buffer_size):
                    yield images[i, :], int(labels[i])
        finally:
            try:

--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -16,7 +16,7 @@ Movielens 1-M dataset.
 Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
 movies, which was collected by GroupLens Research. This module will download
-Movielens 1-M dataset from 
+Movielens 1-M dataset from
 http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
 set and test set into paddle reader creators.
@@ -187,7 +187,7 @@ def max_movie_id():
    Get the maximum value of movie id.
    """
    __initialize_meta_info__()
-    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+    return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
 def max_user_id():
@@ -195,7 +195,7 @@ def max_user_id():
    Get the maximum value of user id.
    """
    __initialize_meta_info__()
-    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+    return reduce(__max_index_info__, list(USER_INFO.values())).index
 def __max_job_id_impl__(a, b):
@@ -210,7 +210,7 @@ def max_job_id():
    Get the maximum value of job id.
    """
    __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+    return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
 def movie_categories():
@@ -243,7 +243,7 @@ def unittest():
    for test_count, _ in enumerate(test()()):
        pass
-    print train_count, test_count
+    print(train_count, test_count)
 def fetch():

--- a/python/paddle/dataset/mq2007.py
+++ b/python/paddle/dataset/mq2007.py
@@ -26,7 +26,7 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
 import os
 import functools
 import rarfile
-from common import download
+from .common import download
 import numpy as np
 # URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
@@ -53,7 +53,7 @@ class Query(object):
  ----------
  query_id : int
    query_id in dataset, mapping from query to relevance documents
-  relevance_score : int 
+  relevance_score : int
    relevance score of query and document pair
  feature_vector : array, dense feature
    feature in vector format
@@ -92,7 +92,7 @@ class Query(object):
            sys.stdout.write("expect 48 space split parts, get %d" %
                             (len(parts)))
            return None
-        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+        # format : 0 qid:10 1:0.000272 2:0.000000 ....
        self.relevance_score = int(parts[0])
        self.query_id = int(parts[1].split(':')[1])
        for p in parts[2:]:
@@ -295,7 +295,7 @@ def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
  --------
  filename : string
  fill_missing : fill the missing value. default in MQ2007 is -1
  Returns
  ------
  yield
@@ -330,4 +330,4 @@ if __name__ == "__main__":
    mytest = functools.partial(
        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
    for label, query in mytest():
-        print label, query
+        print(label, query)
--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -43,11 +43,11 @@ def download_data_if_not_yet():
            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
        movie_reviews.categories()
    except LookupError:
-        print "Downloading movie_reviews data set, please wait....."
+        print("Downloading movie_reviews data set, please wait.....")
        nltk.download(
            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
-        print "Download data set success....."
+        print("Download data set success.....")
-        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+        print("Path is " + nltk.data.find('corpora/movie_reviews').path)
 def get_word_dict():
@@ -64,7 +64,7 @@ def get_word_dict():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
-    words_sort_list = word_freq_dict.items()
+    words_sort_list = list(word_freq_dict.items())
    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))
@@ -80,7 +80,8 @@ def sort_files():
    files_list = list()
    neg_file_list = movie_reviews.fileids('neg')
    pos_file_list = movie_reviews.fileids('pos')
-    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    files_list = list(
+        chain.from_iterable(list(zip(neg_file_list, pos_file_list))))
    return files_list

--- a/python/paddle/dataset/tests/common_test.py
+++ b/python/paddle/dataset/tests/common_test.py
@@ -36,7 +36,7 @@ class TestCommon(unittest.TestCase):
    def test_split(self):
        def test_reader():
            def reader():
-                for x in xrange(10):
+                for x in range(10):
                    yield x
            return reader
@@ -49,7 +49,7 @@ class TestCommon(unittest.TestCase):
    def test_cluster_file_reader(self):
        _, temp_path = tempfile.mkstemp()
-        for x in xrange(5):
+        for x in range(5):
            with open(temp_path + '/%05d.test' % x) as f:
                f.write('%d\n' % x)
        reader = paddle.dataset.common.cluster_files_reader(
@@ -63,7 +63,7 @@ class TestCommon(unittest.TestCase):
        def test_reader():
            def reader():
-                for x in xrange(record_num):
+                for x in range(record_num):
                    yield x
            return reader

--- a/python/paddle/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -59,7 +59,7 @@ class TestMikolov(unittest.TestCase):
        self.assertEqual(first_line, read_line)
    def test_total(self):
-        _, idx = zip(*WORD_DICT.items())
+        _, idx = list(zip(*list(WORD_DICT.items())))
        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)

--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -24,9 +24,8 @@ from nltk.corpus import movie_reviews
 class TestSentimentMethods(unittest.TestCase):
    def test_get_word_dict(self):
        word_dict = st.get_word_dict()[0:10]
-        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+        test_word_list = [(',', 0), ('the', 1), ('.', 2), ('a', 3), ('and', 4),
-                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          ('of', 5), ('to', 6), ("'", 7), ('is', 8), ('in', 9)]
-                          (u'is', 8), (u'in', 9)]
        for idx, each in enumerate(word_dict):
            self.assertEqual(each, test_word_list[idx])
        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)

--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -49,9 +49,12 @@ def feature_range(maximums, minimums):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    feature_num = len(maximums)
-    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.bar(list(range(feature_num)),
+           maximums - minimums,
+           color='r',
+           align='center')
    ax.set_title('feature scale')
-    plt.xticks(range(feature_num), feature_names)
+    plt.xticks(list(range(feature_num)), feature_names)
    plt.xlim([-1, feature_num])
    fig.set_figheight(6)
    fig.set_figwidth(10)
@@ -71,7 +74,7 @@ def load_data(filename, feature_num=14, ratio=0.8):
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
        axis=0) / data.shape[0]
    feature_range(maximums[:-1], minimums[:-1])
-    for i in xrange(feature_num - 1):
+    for i in range(feature_num - 1):
        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
    offset = int(data.shape[0] * ratio)
    UCI_TRAIN_DATA = data[:offset]

--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -154,8 +154,8 @@ def get_dict(dict_size, reverse=True):
    tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
    if reverse:
-        src_dict = {v: k for k, v in src_dict.items()}
+        src_dict = {v: k for k, v in list(src_dict.items())}
-        trg_dict = {v: k for k, v in trg_dict.items()}
+        trg_dict = {v: k for k, v in list(trg_dict.items())}
    return src_dict, trg_dict

--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -70,7 +70,9 @@ def __build_dict(tar_file, dict_size, save_path, lang):
        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
        for idx, word in enumerate(
                sorted(
-                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+                    iter(list(word_dict.items())),
+                    key=lambda x: x[1],
+                    reverse=True)):
            if idx + 3 == dict_size: break
            fout.write("%s\n" % (word[0]))

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -14,49 +14,49 @@
 from __future__ import print_function
 # import all class inside framework into fluid module
-import framework
+from . import framework
-from framework import *
+from .framework import *
 # import all class inside executor into fluid module
-import executor
+from . import executor
-from executor import *
+from .executor import *
-import trainer
+from . import trainer
-from trainer import Trainer
+from .trainer import Trainer
-from trainer import BeginEpochEvent
+from .trainer import BeginEpochEvent
-from trainer import EndEpochEvent
+from .trainer import EndEpochEvent
-from trainer import BeginStepEvent
+from .trainer import BeginStepEvent
-from trainer import EndStepEvent
+from .trainer import EndStepEvent
-from trainer import CheckpointConfig
+from .trainer import CheckpointConfig
-import inferencer
+from . import inferencer
-from inferencer import Inferencer
+from .inferencer import Inferencer
-import io
+from . import io
-import evaluator
+from . import evaluator
-import initializer
+from . import initializer
-import layers
+from . import layers
-import contrib
+from . import contrib
-import nets
+from . import nets
-import optimizer
+from . import optimizer
-import backward
+from . import backward
-import regularizer
+from . import regularizer
-import average
+from . import average
-import metrics
+from . import metrics
-import transpiler
+from . import transpiler
-from param_attr import ParamAttr, WeightNormParamAttr
+from .param_attr import ParamAttr, WeightNormParamAttr
-from data_feeder import DataFeeder
+from .data_feeder import DataFeeder
-from core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
+from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
-from transpiler import DistributeTranspiler, InferenceTranspiler, \
+from .transpiler import DistributeTranspiler, InferenceTranspiler, \
    memory_optimize, release_memory, DistributeTranspilerConfig
-from concurrency import (Go, make_channel, channel_send, channel_recv,
+from .concurrency import (Go, make_channel, channel_send, channel_recv,
-                         channel_close, Select)
+                          channel_close, Select)
-from lod_tensor import create_lod_tensor, create_random_int_lodtensor
+from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
-import clip
+from . import clip
-import profiler
+from . import profiler
-import unique_name
+from . import unique_name
-import recordio_writer
+from . import recordio_writer
-import parallel_executor
+from . import parallel_executor
-from parallel_executor import *
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 Tensor = LoDTensor
@@ -99,8 +99,8 @@ def __bootstrap__():
        None
    """
    import sys
-    import core
    import os
+    from . import core
    in_test = 'unittest' in sys.modules

--- a/python/paddle/fluid/annotations.py
+++ b/python/paddle/fluid/annotations.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import functools
 import sys
@@ -28,7 +29,7 @@ def deprecated(since, instead, extra_message=""):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
-            print >> sys.stderr, err_msg
+            print(err_msg, file=sys.stderr)
            return func(*args, **kwargs)
        wrapper.__doc__ += "\n    "

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -16,7 +16,8 @@ from paddle.fluid import framework as framework
 from . import core
 import collections
 import copy
-import unique_name
+import six
+from . import unique_name
 __all__ = ['append_backward']
@@ -44,17 +45,25 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
    """
    op_desc = core.OpDesc()
    op_desc.set_type(op_type)
-    for para, args in inputs.iteritems():
+    for para, args in list(inputs.items()):
-        op_desc.set_input(para, args)
+        op_desc.set_input(
-    for para, args in outputs.iteritems():
+            para,
-        op_desc.set_output(para, args)
+            list(
+                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
+                    args)))
+    for para, args in list(outputs.items()):
+        op_desc.set_output(
+            para,
+            list(
+                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
+                    args)))
    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
    if op_role_attr_name not in attrs:
        attrs[
            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
-    for name, val in attrs.iteritems():
+    for name, val in list(attrs.items()):
        if isinstance(val, framework.Block):
            op_desc.set_block_attr(name, val.desc)
        else:
@@ -105,7 +114,9 @@ def _strip_grad_suffix_(name):
    e.g. x@GRAD ==> x
         y@GRAD@RENAME@1 ==> y
    """
-    pos = name.find(core.grad_var_suffix())
+    if isinstance(name, six.text_type):
+        name = name.encode()
+    pos = name.find(six.b(core.grad_var_suffix()))
    return name[:pos] if pos != -1 else name
@@ -114,7 +125,9 @@ def _append_grad_suffix_(name):
    Append grad suffix to the given variable name
    e.g. x ==> x@GRAD
    """
-    return name + core.grad_var_suffix()
+    if isinstance(name, six.text_type):
+        name = name.encode()
+    return name + six.b(core.grad_var_suffix())
 def _addup_repetitive_outputs_(op_descs):
@@ -174,7 +187,7 @@ def _addup_repetitive_outputs_(op_descs):
                    op_desc.set_output(param_name, arg_names)
                    renamed_vars[var_name].append(new_name)
-    for var_name, inputs in renamed_vars.iteritems():
+    for var_name, inputs in list(renamed_vars.items()):
        if len(inputs) > 1:
            pending_sum_ops.append(
                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
@@ -198,16 +211,19 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
        out_arg_names = op_desc.output_arg_names()
        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
            return True
-        if _all_in_set_(
+        if _all_in_set_([
-                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
+                name for name in op_desc.input_arg_names()
-                       op_desc.input_arg_names()), no_grad_set):
+                if name.find(core.grad_var_suffix()) != -1
+        ], no_grad_set):
            no_grad_set.update(out_arg_names)
            return True
        return False
    # Remove ops whose outputs are all in no_grad_dict
-    op_descs = filter(
+    op_descs = [
-        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
+        op_desc for op_desc in op_descs
+        if not _op_can_be_removed_(op_desc, no_grad_set)
+    ]
    # Insert fill_zeros_like_op
    to_insert = []
    for idx, op_desc in enumerate(op_descs):
@@ -217,12 +233,12 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
                    "X": [_strip_grad_suffix_(arg)]
                }, {"Out": [arg]}, {}), idx))
-    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+    list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)])
    return op_descs
-import proto.framework_pb2 as framework_pb2
+from .proto import framework_pb2
 def serialize_op_decs(op_desc):
@@ -244,8 +260,10 @@ def _callback_lookup_(op):
    if op.type == 'parallel_do' and op.attr('use_nccl'):
        all_vars = op.block.vars
        param_names = set(op.input('parameters'))
-        param_names = filter(lambda name: all_vars[name].stop_gradient is False,
+        param_names = [
-                             param_names)
+            name for name in param_names
+            if all_vars[name].stop_gradient is False
+        ]
        param_grad_names = [n + "@GRAD" for n in param_names]
        class ParallelDoCallBack(object):
@@ -399,7 +417,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
                continue
            block.desc.var(grad_var_name)
            new_vars.add(grad_var_name)
-            if not grad_to_var.has_key(grad_var_name):
+            if grad_var_name not in grad_to_var:
                continue
            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
        # infer_shape and infer_type
@@ -427,7 +445,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
                op_desc.rename_output(name, new_name)
                var_map[name] = new_name
-    for g, ng in var_map.iteritems():
+    for g, ng in list(var_map.items()):
        if g in grad_to_var:
            grad_to_var[ng] = grad_to_var[g]
            grad_to_var.pop(g)
@@ -439,7 +457,7 @@ def _get_stop_gradients_(program):
    for block in program.blocks:
        assert isinstance(block, framework.Block)
        block_no_grad_set = set()
-        for var in block.vars.itervalues():
+        for var in list(block.vars.values()):
            assert isinstance(var, framework.Variable)
            if var.stop_gradient:
                block_no_grad_set.add(_append_grad_suffix_(var.name))
@@ -452,51 +470,51 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
    """
    Append backward part to main_program.
-    A complete neural network training is made up of forward and backward 
+    A complete neural network training is made up of forward and backward
-    propagation. However, when we configure a network, we only need to 
+    propagation. However, when we configure a network, we only need to
-    specify its forwrd part. The backward part is generated automatically 
+    specify its forward part. The backward part is generated automatically
    according to the forward part by this function.
-    In most cases, users do not need to invoke this function manually. It 
+    In most cases, users do not need to invoke this function manually. It
    will be automatically invoked by the optimizer's `minimize` function.
    Args:
        loss(Variable): The loss variable of the network.
-        parameter_list(list[string]|None): Names of parameters that need 
+        parameter_list(list[string]|None): Names of parameters that need
-                                           to be updated by optimizers. 
+                                           to be updated by optimizers.
-                                           If it is None, all parameters 
+                                           If it is None, all parameters
                                           will be updated.
                                           Default: None
-        no_grad_set(set|None): Variables in the Block 0 whose gradients 
+        no_grad_set(set|None): Variables in the Block 0 whose gradients
-                               should be ignored. All variables with 
+                               should be ignored. All variables with
-                               `step_gradient=True` from all blocks will 
+                               `stop_gradient=True` from all blocks will
                               be automatically added into this set.
                               Default: None
-        callbacks(list[callable object]|None): The callbacks are used for 
+        callbacks(list[callable object]|None): The callbacks are used for
-                                               doing some custom jobs during 
+                                               doing some custom jobs during
-                                               backward part building. All 
+                                               backward part building. All
-                                               callable objects in it will 
+                                               callable objects in it will
-                                               be invoked once each time a 
+                                               be invoked once each time a
-                                               new gradient operator is added 
+                                               new gradient operator is added
-                                               into the program. The callable 
+                                               into the program. The callable
-                                               object must has two input 
+                                               object must have two input
-                                               parameters: 'block' and 'context'. 
+                                               parameters: 'block' and 'context'.
-                                               The 'block' is the block which 
+                                               The 'block' is the block which
-                                               the new gradient operator will 
+                                               the new gradient operator will
-                                               be added to. The 'context' is a 
+                                               be added to. The 'context' is a
-                                               map, whose keys are gradient 
+                                               map, whose keys are gradient
-                                               variable names and values are 
+                                               variable names and values are
                                               corresponding original variables.
-                                               In addition to this, the 'context' 
+                                               In addition to this, the 'context'
-                                               has another special key-value pair: 
+                                               has another special key-value pair:
-                                               the key is string '__current_op_desc__' 
+                                               the key is string '__current_op_desc__'
-                                               and the value is the op_desc of the 
+                                               and the value is the op_desc of the
+                                               gradient operator that has just
+                                               gradient operator who has just
-                                               triggered the callable object. 
+                                               triggered the callable object.
    Returns:
-        list[(Variable,Variable)]: Pairs of parameter and its 
+        list[(Variable,Variable)]: Pairs of parameter and its
-        corresponding gradients. The key is the parameter and the 
+        corresponding gradients. The key is the parameter and the
        value is gradient variable.
    Raises:
@@ -535,7 +553,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        no_grad_set = set()
    no_grad_set = copy.copy(no_grad_set)
    no_grad_dict = _get_stop_gradients_(program)
-    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
    grad_info_map = dict()
    root_block = program.block(0)
@@ -558,7 +576,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
    op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
-    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
    _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
                          grad_to_var, callbacks)
@@ -697,7 +715,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
        no_grad_set = set()
    no_grad_set = copy.copy(no_grad_set)
    no_grad_dict = _get_stop_gradients_(prog)
-    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, no_grad_set)))
    fwd_op_num = block.desc.op_size()
@@ -731,7 +749,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
-    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
    grad_to_var = dict()
    grad_info_map = dict()
    _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 import copy
+import six
 import functools
-import layers
+from . import layers
-import framework
+from . import framework
 from . import core
 __all__ = [
@@ -80,8 +81,7 @@ def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
-    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+    for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
-                         op_desc.output_arg_names()):
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (error_clip is None or isinstance(error_clip,
@@ -247,8 +247,8 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
    """
    def __init__(self, clip_norm, group_name="default_group"):
-        if not isinstance(group_name, basestring):
+        if not isinstance(group_name, six.string_types):
-            raise TypeError("'group_name' must be a basestring.")
+            raise TypeError("'group_name' must be a %s." % (six.string_types))
        self.clip_norm = clip_norm
        self.group_name = group_name
@@ -284,7 +284,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
                x=clip_var,
                y=layers.elementwise_max(
                    x=clip_var, y=group_norm_var))
-            assert group_scale_var.shape == (1L, )
+            assert group_scale_var.shape == (1, )
            self.context[group_scale_name] = group_scale_var
        new_grad = layers.elementwise_mul(
@@ -313,7 +313,7 @@ def set_gradient_clip(clip, param_list=None, program=None):
        program = framework.default_main_program()
    if param_list is None:
        param_list = program.block(0).all_parameters()
-    if all(isinstance(elem, basestring) for elem in param_list):
+    if all(isinstance(elem, six.string_types) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from layers.control_flow import BlockGuard, equal
+from .layers.control_flow import BlockGuard, equal
 from .framework import Operator
-from layer_helper import LayerHelper, unique_name
+from .layer_helper import LayerHelper, unique_name
-from layers import fill_constant
+from .layers import fill_constant
-import core
+from . import core
 __all__ = [
    'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import decoder
+from . import decoder
-from decoder import *
+from .decoder import *
-import memory_usage_calc
+from . import memory_usage_calc
-from memory_usage_calc import *
+from .memory_usage_calc import *
 __all__ = decoder.__all__ + memory_usage_calc.__all__
--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import beam_search_decoder
+from . import beam_search_decoder
-from beam_search_decoder import *
+from .beam_search_decoder import *
 __all__ = beam_search_decoder.__all__
--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -22,6 +22,7 @@ This API is still under active development and may change drastically.
 import contextlib
 import numpy as np
+import six
 from ... import layers
 from ...framework import Variable
@@ -191,7 +192,7 @@ class StateCell(object):
        self._helper = LayerHelper('state_cell', name=name)
        self._cur_states = {}
        self._state_names = []
-        for state_name, state in states.items():
+        for state_name, state in six.iteritems(states):
            if not isinstance(state, InitState):
                raise ValueError('state must be an InitState object.')
            self._cur_states[state_name] = state
@@ -346,7 +347,7 @@ class StateCell(object):
        if self._in_decoder and not self._switched_decoder:
            self._switch_decoder()
-        for input_name, input_value in inputs.items():
+        for input_name, input_value in six.iteritems(inputs):
            if input_name not in self._inputs:
                raise ValueError('Unknown input %s. '
                                 'Please make sure %s in input '
@@ -361,7 +362,7 @@ class StateCell(object):
        if self._in_decoder and not self._switched_decoder:
            self._switched_decoder()
-        for state_name, decoder_state in self._states_holder.items():
+        for state_name, decoder_state in six.iteritems(self._states_holder):
            if id(self._cur_decoder_obj) not in decoder_state:
                raise ValueError('Unknown decoder object, please make sure '
                                 'switch_decoder been invoked.')
@@ -671,7 +672,7 @@ class BeamSearchDecoder(object):
            feed_dict = {}
            update_dict = {}
-            for init_var_name, init_var in self._input_var_dict.items():
+            for init_var_name, init_var in six.iteritems(self._input_var_dict):
                if init_var_name not in self.state_cell._inputs:
                    raise ValueError('Variable ' + init_var_name +
                                     ' not found in StateCell!\n')
@@ -721,7 +722,8 @@ class BeamSearchDecoder(object):
                    self.state_cell.update_states()
                    self.update_array(prev_ids, selected_ids)
                    self.update_array(prev_scores, selected_scores)
-                    for update_name, var_to_update in update_dict.items():
+                    for update_name, var_to_update in six.iteritems(
+                            update_dict):
                        self.update_array(var_to_update, feed_dict[update_name])
    def read_array(self, init, is_ids=False, is_scores=False):

--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -12,14 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
+from . import core
-import core
 import numpy
 import os
-import six.moves as six
+import six
+from six.moves import zip, range, xrange
 import multiprocessing
-from framework import Variable, default_main_program
+from .framework import Variable, default_main_program
 __all__ = ['DataFeeder']
@@ -53,7 +53,7 @@ class DataToLoDTensorConverter(object):
        self.data = []
        self.lod = []
-        for i in six.range(lod_level):
+        for i in six.moves.range(lod_level):
            self.lod.append([])
    def feed(self, data):
@@ -142,7 +142,7 @@ class DataFeeder(object):
        if program is None:
            program = default_main_program()
        for each_var in feed_list:
-            if isinstance(each_var, basestring):
+            if isinstance(each_var, six.string_types):
                each_var = program.block(0).var(each_var)
            if not isinstance(each_var, Variable):
                raise TypeError("Feed list should contain a list of variable")
@@ -174,7 +174,7 @@ class DataFeeder(object):
            dict: the result of conversion.
        """
        converter = []
-        for lod_level, shape, dtype in six.zip(
+        for lod_level, shape, dtype in six.moves.zip(
                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
            converter.append(
                DataToLoDTensorConverter(
@@ -187,10 +187,12 @@ class DataFeeder(object):
            assert len(each_sample) == len(converter), (
                "The number of fields in data (%s) does not match " +
                "len(feed_list) (%s)") % (len(each_sample), len(converter))
-            for each_converter, each_slot in six.zip(converter, each_sample):
+            for each_converter, each_slot in six.moves.zip(converter,
+                                                           each_sample):
                each_converter.feed(each_slot)
        ret_dict = {}
-        for each_name, each_converter in six.zip(self.feed_names, converter):
+        for each_name, each_converter in six.moves.zip(self.feed_names,
+                                                       converter):
            ret_dict[each_name] = each_converter.done()
        return ret_dict
@@ -212,12 +214,14 @@ class DataFeeder(object):
        if isinstance(self.place, core.CUDAPlace):
            places = [
                core.CUDAPlace(i)
-                for i in six.xrange(self._get_number_of_places_(num_places))
+                for i in six.moves.xrange(
+                    self._get_number_of_places_(num_places))
            ]
        else:
            places = [
                core.CPUPlace()
-                for _ in six.xrange(self._get_number_of_places_(num_places))
+                for _ in six.moves.xrange(
+                    self._get_number_of_places_(num_places))
            ]
        if len(iterable) != len(places):
@@ -227,7 +231,7 @@ class DataFeeder(object):
                             "must be same.")
        place = self.place
-        for p, batch in six.zip(places, iterable):
+        for p, batch in six.moves.zip(places, iterable):
            self.place = p
            yield self.feed(batch)
        self.place = place

--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -14,8 +14,8 @@
 import sys
 import re
-from graphviz import GraphPreviewGenerator
+from .graphviz import GraphPreviewGenerator
-import proto.framework_pb2 as framework_pb2
+from .proto import framework_pb2
 from google.protobuf import text_format
 _vartype2str_ = [

--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -15,11 +15,11 @@
 import warnings
 import numpy as np
-import layers
+from . import layers
-from framework import Program, Variable, program_guard
+from .framework import Program, Variable, program_guard
-import unique_name
+from . import unique_name
-from layer_helper import LayerHelper
+from .layer_helper import LayerHelper
-from initializer import Constant
+from .initializer import Constant
 __all__ = [
    'ChunkEvaluator',

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -14,7 +14,8 @@
 import numpy as np
 import contextlib
-from framework import Program, default_main_program, Variable
+import six
+from .framework import Program, default_main_program, Variable
 from . import core
 __all__ = [
@@ -204,19 +205,19 @@ def fetch_var(name, scope=None, return_numpy=True):
 def _get_program_cache_key(feed, fetch_list):
-    feed_var_names = feed.keys()
+    feed_var_names = list(feed.keys())
    def to_name_str(var):
        if isinstance(var, Variable):
            return var.desc.name()
        elif isinstance(var, str):
            return var
-        elif isinstance(var, basestring):
+        elif isinstance(var, six.string_types):
            return str(var)
        else:
            raise TypeError(str(var) + " should be Variable or str")
-    fetch_var_names = map(to_name_str, fetch_list)
+    fetch_var_names = list(map(to_name_str, fetch_list))
    return str(feed_var_names + fetch_var_names)
@@ -229,8 +230,8 @@ class Executor(object):
    to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
    the variables(or names) that user want to get after program run. Note: the executor will run all
    operators in the program but not only the operators dependent by the fetch_list.
-    It store the global variables into the global scope, and create a local scope for the temporary 
+    It stores the global variables into the global scope, and creates a local scope for the temporary
-    variables. The local scope contents will be discarded after every minibatch forward/backward finished. 
+    variables. The local scope contents will be discarded after every minibatch forward/backward finished.
    But the global scope variables will be persistent through different runs.
    All of ops in program will be running in sequence.
@@ -345,7 +346,7 @@ class Executor(object):
    def _fetch_data(self, fetch_list, fetch_var_name, scope):
        outs = [
            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in xrange(len(fetch_list))
+            for i in range(len(fetch_list))
        ]
        return outs

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -15,21 +15,22 @@
 import collections
 import contextlib
 import re
+import six
 import numpy as np
-import proto.framework_pb2 as framework_pb2
+from .proto import framework_pb2
 try:
    from . import core
-except ImportError, e:
+except ImportError as e:
    raise ImportError(
        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
    if you encounters \"libmkldnn.so not found\" errors. If you have python
    installed in other directory, replace \"/usr/local/lib\" with your own
    directory. The original error is: \n""" + e.message)
-except Exception, e:
+except Exception as e:
    raise e
-import unique_name
+from . import unique_name
 __all__ = [
    'Program',
@@ -86,7 +87,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
    elif dtype == np.uint8:
        return core.VarDesc.VarType.UINT8
    else:
-        raise ValueError("Not supported numpy dtype " + str(dtype))
+        raise ValueError("Not supported numpy dtype " + six.binary_type(dtype))
 def dtype_is_floating(dtype):
@@ -129,15 +130,15 @@ def _debug_string_(proto, throw_on_error=True):
 class Variable(object):
    """
-    In Fluid, every input and output of an operator is a variable. In most 
+    In Fluid, every input and output of an operator is a variable. In most
-    cases, variables are used for holding different kinds of data or training 
+    cases, variables are used for holding different kinds of data or training
-    labels. A variable belongs to a block. All variable has its own name and 
+    labels. A variable belongs to a block. Every variable has its own name and
    two variables in different blocks could have the same name.
-    There are many kinds of variables. Each kind of them has its own attributes 
+    There are many kinds of variables. Each kind of them has its own attributes
-    and usages. Please reference the framework.proto for details. 
+    and usages. Please refer to framework.proto for details.
-    Most of a Variable's member variables can be setted to be None. It mean 
+    Most of a Variable's member variables can be set to None, which means
    it is not available or will be specified later.
    Args:
@@ -197,6 +198,7 @@ class Variable(object):
        if name is None:
            name = unique_name.generate('_generated_var')
        is_new_var = False
+        name = name if isinstance(name, six.binary_type) else name.encode()
        self.desc = self.block.desc.find_var(name)
        if self.desc is None:
@@ -290,13 +292,13 @@ class Variable(object):
        assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                               bool)
        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.VarDesc.FromString(str(protostr))
+        proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr))
        res_str = _debug_string_(proto, throw_on_error)
        if with_details:
            additional_attr = ("error_clip", "stop_gradient")
            for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
+                res_str += "%s: %s\n" % (
-                                         str(getattr(self, attr_name)))
+                    attr_name, six.binary_type(getattr(self, attr_name)))
        return res_str
    __repr__ = __str__
@@ -369,7 +371,7 @@ def get_all_op_protos():
    protostrs = core.get_all_op_protos()
    ret_values = []
    for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
        ret_values.append(op_proto)
    return ret_values
@@ -472,7 +474,6 @@ class Operator(object):
                 inputs=None,
                 outputs=None,
                 attrs=None):
        self.block = block
        self.desc = desc
        self.attrs = attrs
@@ -523,10 +524,19 @@ class Operator(object):
                            % (in_proto.name, len(in_args)))
                    in_arg_names = []
                    for arg in in_args:
-                        if isinstance(arg, basestring):
+                        if isinstance(arg, six.string_types):
                            in_arg_names.append(arg)
+                        elif isinstance(arg, six.binary_type):
+                            in_arg_names.append(arg.decode())
                        else:
-                            in_arg_names.append(arg.name)
+                            if isinstance(arg.name, six.string_types):
+                                in_arg_names.append(arg.name)
+                            elif isinstance(arg.name, six.binary_type):
+                                in_arg_names.append(arg.name.decode())
+                            else:
+                                raise TypeError(
+                                    "arguments require unicode, str or bytes, but get %s instead."
+                                    % (type(arg.name)))
                    self.desc.set_input(in_proto.name, in_arg_names)
                else:
                    self.desc.set_input(in_proto.name, [])
@@ -541,8 +551,9 @@ class Operator(object):
            if not given == need:
                raise ValueError(("Incorrect setting for output(s) of "
                                  "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type, ", ".join(str(e) for e in need),
+                                 (type,
-                                  ", ".join(str(e) for e in given)))
+                                  ", ".join(six.binary_type(e) for e in need),
+                                  ", ".join(six.binary_type(e) for e in given)))
            for out_proto in proto.outputs:
                out_args = outputs[out_proto.name]
@@ -554,7 +565,14 @@ class Operator(object):
                        (out_proto.name, len(out_args)))
                out_arg_names = []
                for arg in out_args:
-                    out_arg_names.append(arg.name)
+                    if isinstance(arg.name, six.string_types):
+                        out_arg_names.append(arg.name)
+                    elif isinstance(arg.name, six.binary_type):
+                        out_arg_names.append(arg.name.decode())
+                    else:
+                        raise TypeError(
+                            "arguments require unicode, str or bytes, but get %s instead."
+                            % (type(arg.name)))
                    arg.op = self
                self.desc.set_output(out_proto.name, out_arg_names)
@@ -590,7 +608,7 @@ class Operator(object):
        """
        protostr = self.desc.serialize_to_string()
-        proto = framework_pb2.OpDesc.FromString(str(protostr))
+        proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
        return _debug_string_(proto, throw_on_error)
    def __str__(self):
@@ -845,7 +863,7 @@ class Block(object):
            re_add_indent = re.compile(r"\n(.)")
            res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
                self.idx, self.parent_idx)
-            for var in self.vars.itervalues():
+            for var in list(self.vars.values()):
                res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
                    r"\n    \1", var.to_string(throw_on_error, with_details))
            for op in self.ops:
@@ -854,7 +872,8 @@ class Block(object):
            res_str += "\n}"
        else:
            protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.BlockDesc.FromString(str(protostr))
+            proto = framework_pb2.BlockDesc.FromString(
+                six.binary_type(protostr))
            res_str = _debug_string_(proto, throw_on_error)
        return res_str
@@ -898,10 +917,11 @@ class Block(object):
        Returns:
            Variable: the Variable with the giving name.
        """
-        if not isinstance(name, basestring):
+        if not isinstance(name, six.string_types):
-            raise TypeError(
+            if not isinstance(name, six.binary_type):
-                "var require string as parameter, but get %s instead." %
+                raise TypeError(
-                (type(name)))
+                    "var require string as parameter, but get %s instead." %
+                    (type(name)))
        v = self.vars.get(name, None)
        if v is None:
            raise ValueError("var %s not in this block" % name)
@@ -949,10 +969,10 @@ class Block(object):
        raise ValueError("Var {0} is not found recursively".format(name))
    def all_parameters(self):
-        return list(self._iter_parameters())
+        return list(self.iter_parameters())
-    def _iter_parameters(self):
+    def iter_parameters(self):
-        return (item[1] for item in self.vars.iteritems()
+        return (item[1] for item in list(self.vars.items())
                if isinstance(item[1], Parameter))
    def create_var(self, *args, **kwargs):
@@ -1132,7 +1152,7 @@ class Block(object):
                self.create_var(name=var.name(), desc=var, type=var.type())
        # sync variables removed from c++ end
-        for var in self.vars.keys():
+        for var in list(self.vars.keys()):
            if not self.desc.find_var(var):
                self.vars.pop(var)
@@ -1204,7 +1224,7 @@ class Block(object):
        if not isinstance(other, Block):
            raise TypeError(
                "_copy_param_info_from should be invoked with Block")
-        for p in other._iter_parameters():
+        for p in other.iter_parameters():
            assert isinstance(p, Parameter)
            v = self.vars.get(p.name, None)
            if v is None:
@@ -1403,7 +1423,8 @@ class Program(object):
                res_str += block.to_string(throw_on_error, with_details)
        else:
            protostr = self.desc.serialize_to_string()
-            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            proto = framework_pb2.ProgramDesc.FromString(
+                six.binary_type(protostr))
            res_str = _debug_string_(proto, throw_on_error)
        return res_str
@@ -1501,7 +1522,7 @@ class Program(object):
        else:
            p = Program()
            p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.blocks = [Block(p, i) for i in range(self.desc.num_blocks())]
            p._sync_with_cpp()
        p._copy_param_info_from(self)
@@ -1553,7 +1574,7 @@ class Program(object):
            targets_idx.append([t.block.idx, t.idx])
        res = Program()
        res.desc = core.prune(self.desc, targets_idx)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
        res._sync_with_cpp()
        return res
@@ -1564,7 +1585,7 @@ class Program(object):
        2. Remove the :code:`read_op` if exists.
-        3. change the :code:`is_test` 
+        3. change the :code:`is_test`
        attribute of operators to :code:`True`. All the :code:`Parameter`
        information will be lost.
@@ -1594,13 +1615,13 @@ class Program(object):
                root_block._remove_var(var.name())
        # change all `is_test` attributes to True
-        for i in xrange(res.desc.num_blocks()):
+        for i in range(res.desc.num_blocks()):
            block = res.desc.block(i)
-            for j in xrange(block.op_size()):
+            for j in range(block.op_size()):
                op = block.op(j)
                if op.has_attr('is_test'):
                    op.set_attr('is_test', True)
-        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
        res._sync_with_cpp()
        return res
@@ -1613,14 +1634,14 @@ class Program(object):
        and deserialization.
        Args:
-            binary_str(str): The binary prootbuf string.
+            binary_str(str): The binary protobuf string.
        Returns:
            Program: A deserialized program desc.
        """
        p = Program()
        p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
+        p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
        p._sync_with_cpp()
        return p
@@ -1648,7 +1669,7 @@ class Program(object):
        self._seed = seed
    def __repr__(self):
-        return str(self)
+        return self.__str__()
    def global_block(self):
        """
@@ -1759,7 +1780,7 @@ class Program(object):
        if len(self.blocks) != len(other.blocks):
            raise ValueError("_copy_param_info_from should be invoked with two "
                             "program, with represent the same topology")
-        for var in other.global_block().vars.itervalues():
+        for var in list(other.global_block().vars.values()):
            if var.is_data:
                self.global_block().var(var.name).is_data = True
@@ -1771,15 +1792,15 @@ class Program(object):
            iterable: The generator will yield every variable in this program.
        """
        for each_block in self.blocks:
-            for each_var in each_block.vars.itervalues():
+            for each_var in list(each_block.vars.values()):
                yield each_var
 class Parameter(Variable):
    """
-    Parameter is derived from Variable. A parameter is a persistable 
+    Parameter is derived from Variable. A parameter is a persistable
    Variable, and will be updated by optimizers after each iteration.
-    The training of a neural network is essentially the updating of 
+    The training of a neural network is essentially the updating of
    its parameters.
    Relative to a general Variable, a Parameter has several its own
@@ -1845,8 +1866,8 @@ class Parameter(Variable):
            additional_attr = ("trainable", "optimize_attr", "regularizer",
                               "gradient_clip_attr", "do_model_average")
            for attr_name in additional_attr:
-                res_str += "%s: %s\n" % (attr_name,
+                res_str += "%s: %s\n" % (
-                                         str(getattr(self, attr_name)))
+                    attr_name, six.binary_type(getattr(self, attr_name)))
        else:
            res_str = Variable.to_string(self, throw_on_error, False)
        return res_str

--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -14,12 +14,13 @@
 import os
 import random
+import six
 import subprocess
 import logging
 def crepr(v):
-    if type(v) is str or type(v) is unicode:
+    if isinstance(v, six.string_types):
        return '"%s"' % v
    return str(v)
@@ -104,7 +105,7 @@ class Graph(object):
    def _rank_repr(self):
        ranks = sorted(
-            self.rank_groups.items(),
+            list(self.rank_groups.items()),
            cmp=lambda a, b: a[1].priority > b[1].priority)
        repr = []
        for x in ranks:
@@ -148,7 +149,7 @@ class Node(object):
            name=self.name,
            label=self.label,
            extra=',' + ','.join("%s=%s" % (key, crepr(value))
-                                 for key, value in self.attrs.items())
+                                 for key, value in list(self.attrs.items()))
            if self.attrs else "")
        return reprs
@@ -172,7 +173,7 @@ class Edge(object):
            target=self.target.name,
            extra="" if not self.attrs else
            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in self.attrs.items()) + "]")
+                           for attr in list(self.attrs.items())) + "]")
        return repr

--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -14,14 +14,14 @@
 import contextlib
-import core
+from . import core
-import executor
+from . import executor
-import framework
+from . import framework
-import io
+from . import io
-import parallel_executor
+from . import parallel_executor
-import unique_name
+from . import unique_name
-from trainer import check_and_get_place
+from .trainer import check_and_get_place
 __all__ = ['Inferencer', ]

--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import framework
+from . import framework
 import numpy as np
 import contextlib
-from framework import convert_np_dtype_to_dtype_
+from .framework import convert_np_dtype_to_dtype_
-from core import VarDesc
+from .core import VarDesc
 __all__ = [
    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -14,12 +14,14 @@
 import copy
 import itertools
+import six
-from framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
+from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating
-import unique_name
+from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
-from param_attr import ParamAttr, WeightNormParamAttr
+from .param_attr import ParamAttr, WeightNormParamAttr
-import core
+from . import core
+from six.moves import zip
 class LayerHelper(object):
@@ -83,7 +85,7 @@ class LayerHelper(object):
            raise ValueError("parameter number mismatch")
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
-            for i in xrange(length):
+            for i in range(length):
                tmp[i] = copy.deepcopy(param_attr[0])
            param_attr = tmp
        return param_attr
@@ -91,7 +93,7 @@ class LayerHelper(object):
    def iter_inputs_and_params(self, input_param_name='input'):
        inputs = self.multiple_input(input_param_name)
        param_attrs = self.multiple_param_attr(len(inputs))
-        for ipt, param_attr in itertools.izip(inputs, param_attrs):
+        for ipt, param_attr in zip(inputs, param_attrs):
            yield ipt, param_attr
    def input_dtype(self, input_param_name='input'):
@@ -218,7 +220,7 @@ class LayerHelper(object):
                norm = __norm_op(reshape, dim=0, block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
-                perm = range(len(x.shape))
+                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                norm = __norm_op(transpose, dim=0, block=block)
@@ -397,8 +399,10 @@ class LayerHelper(object):
        act = self.kwargs.get('act', None)
        if act is None:
            return input_var
-        if isinstance(act, basestring):
+        if isinstance(act, six.string_types):
            act = {'type': act}
+        else:
+            raise TypeError(str(act) + " should be unicode or str")
        if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
            act['use_cudnn'] = self.kwargs.get('use_cudnn')

--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -12,25 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import ops
+from . import ops
-from ops import *
+from .ops import *
-import nn
+from . import nn
-from nn import *
+from .nn import *
-import io
+from . import io
-from io import *
+from .io import *
-import tensor
+from . import tensor
-from tensor import *
+from .tensor import *
-import control_flow
+from . import control_flow
-from control_flow import *
+from .control_flow import *
-import device
+from . import device
-from device import *
+from .device import *
-import math_op_patch
+from . import math_op_patch
-from math_op_patch import *
+from .math_op_patch import *
-import detection
+from . import detection
-from detection import *
+from .detection import *
-import metric_op
+from . import metric_op
-from metric_op import *
+from .metric_op import *
-from learning_rate_scheduler import *
+from .learning_rate_scheduler import *
 __all__ = []
 __all__ += nn.__all__

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,15 +13,16 @@
 # limitations under the License.
 import contextlib
-from layer_function_generator import autodoc, templatedoc
+from .layer_function_generator import autodoc, templatedoc
-from tensor import assign, fill_constant
+from .tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
 from ..initializer import force_init_on_cpu
-from ops import logical_and, logical_not, logical_or
+from .ops import logical_and, logical_not, logical_or
 import numpy
 import warnings
+from functools import reduce
 __all__ = [
    'While',
@@ -276,7 +277,7 @@ class ParallelDo(object):
          avg_cost = fluid.layers.mean(x=cost)
    .. warning::
       It will be soon deprecated, please use ParallelExecutor instead.
    """
@@ -601,7 +602,7 @@ class StaticRNN(object):
        boot_memories = []
        pre_memories = []
        memories = []
-        for _, mem in self.memories.iteritems():
+        for _, mem in list(self.memories.items()):
            boot_memories.append(mem.init)
            pre_memories.append(mem.pre_mem.name)
            mem_var = rnn_block.var(mem.mem.name)
@@ -819,21 +820,21 @@ def max_sequence_len(rank_table):
 def lod_tensor_to_array(x, table):
-    """ 
+    """
    Convert a LoDTensor to a LoDTensorArray.
-    This function split a LoDTesnor to a LoDTensorArray according to its LoD 
+    This function splits a LoDTensor into a LoDTensorArray according to its LoD
-    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in
-    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read
-    or written by `read_from_array()` and `write_to_array()` operators. However, 
+    or written by `read_from_array()` and `write_to_array()` operators. However,
-    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`.
    Users should not use it directly.
    Args:
        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
        table (ParamAttr|list): The variable that stores the level of lod
                                which is ordered by sequence length in
-                                descending order. It is generally generated 
+                                descending order. It is generally generated
                                by `layers.lod_rank_table()` API.
    Returns:
@@ -1067,9 +1068,9 @@ def array_read(array, i):
        Given:
        array = [0.6, 0.1, 0.3, 0.1]
        And:
        i = 2
        Then:
@@ -1176,9 +1177,9 @@ def array_length(array):
 class ConditionalBlockGuard(BlockGuard):
    """
-    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for 
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for
-    holding a ConditionalBlock, and helping users entering and exiting the 
+    holding a ConditionalBlock, and helping users entering and exiting the
-    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard 
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard
    is generally an internal component of IfElse, users should not use it directly.
    """
@@ -1512,7 +1513,7 @@ class IfElse(object):
    def __call__(self):
        if self.status != self.OUT_IF_ELSE_BLOCKS:
            raise ValueError("IfElse::__call__ must be out of sub-block")
-        false_len, true_len = map(len, self.output_table)
+        false_len, true_len = list(map(len, self.output_table))
        if false_len == 0 and true_len == 0:
            raise ValueError("Must invoke true_block/false_block before "
                             "__call__")
@@ -1932,7 +1933,7 @@ def is_empty(x, cond=None, **ignored):
    Args:
        x (Variable): The Variable to be tested.
-        cond (Variable|None): Output parameter. Returns the test result 
+        cond (Variable|None): Output parameter. Returns the test result
                              of given 'x'. Default: None
    Returns:

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -15,12 +15,13 @@
 All layers just related to the detection neural network.
 """
-from layer_function_generator import generate_layer_fn
+from .layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc, templatedoc
+from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
-import tensor
+from . import tensor
-import nn
+from . import nn
 import math
+from functools import reduce
 __all__ = [
    'prior_box',
@@ -1032,7 +1033,7 @@ def multi_box_head(inputs,
        min_sizes = []
        max_sizes = []
        step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
-        for ratio in xrange(min_ratio, max_ratio + 1, step):
+        for ratio in range(min_ratio, max_ratio + 1, step):
            min_sizes.append(base_size * ratio / 100.)
            max_sizes.append(base_size * (ratio + step) / 100.)
        min_sizes = [base_size * .10] + min_sizes

--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -15,7 +15,7 @@
 All util layers.
 """
-from layer_function_generator import autodoc
+from .layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
 from ..annotations import deprecated

--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -16,8 +16,8 @@ import multiprocessing
 import threading
 from ..data_feeder import DataFeeder
-from control_flow import BlockGuard
+from .control_flow import BlockGuard
-from layer_function_generator import templatedoc
+from .layer_function_generator import templatedoc
 from .. import core
 from ..executor import global_scope
 from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
@@ -69,7 +69,7 @@ def data(name,
    """
    helper = LayerHelper('data', **locals())
    shape = list(shape)
-    for i in xrange(len(shape)):
+    for i in range(len(shape)):
        if shape[i] is None:
            shape[i] = -1
            append_batch_size = False
@@ -387,9 +387,9 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
    Create a uniform random data generator
    This layer returns a Reader Variable.
-    Instead of opening a file and reading data from it, this 
+    Instead of opening a file and reading data from it, this
-    Reader Variable generates float uniform random data by itself. 
+    Reader Variable generates float uniform random data by itself.
-    It can be used as a dummy reader to test a network without 
+    It can be used as a dummy reader to test a network without
    opening a real file.
    Args:
@@ -707,9 +707,9 @@ def open_files(filenames,
    """
    Open files
-    This layer takes a list of files to read from and returns a Reader Variable. 
+    This layer takes a list of files to read from and returns a Reader Variable.
-    Via the Reader Variable, we can get data from given files. All files must 
+    Via the Reader Variable, we can get data from given files. All files must
-    have name suffixs to indicate their formats, e.g., '*.recordio'. 
+    have name suffixes to indicate their formats, e.g., '*.recordio'.
    Args:
       filenames(list): The list of file names.
@@ -825,9 +825,9 @@ def shuffle(reader, buffer_size):
 def batch(reader, batch_size):
    """
-    This layer is a reader decorator. It takes a reader and adds 
+    This layer is a reader decorator. It takes a reader and adds
-    'batching' decoration on it. When reading with the result 
+    'batching' decoration on it. When reading with the result
-    decorated reader, output data will be automatically organized 
+    decorated reader, output data will be automatically organized
    to the form of batches.
    Args:
@@ -852,11 +852,11 @@ def batch(reader, batch_size):
            # If we read data with the raw_reader:
            #     data = fluid.layers.read_file(raw_reader)
            # We can only get data instance by instance.
-            # 
+            #
            # However, if we read data with the batch_reader:
            #     data = fluid.layers.read_file(batch_reader)
-            # Each 5 adjacent instances will be automatically combined together 
+            # Each 5 adjacent instances will be automatically combined together
-            # to become a batch. So what we get('data') is a batch data instead 
+            # to become a batch. So what we get('data') is a batch data instead
            # of an instance.
    """
    return __create_unshared_decorated_reader__(
@@ -903,8 +903,8 @@ def read_file(reader):
    """
    Execute the given reader and get data via it.
-    A reader is also a Variable. It can be a raw reader generated by 
+    A reader is also a Variable. It can be a raw reader generated by
-    `fluid.layers.open_files()` or a decorated one generated by 
+    `fluid.layers.open_files()` or a decorated one generated by
    `fluid.layers.double_buffer()` and so on.
    Args:
@@ -1005,7 +1005,7 @@ class Preprocessor(object):
        source_lod_levels = self.underlying_reader.desc.lod_levels()
        self.source_var_names = [
            unique_name("preprocessor_source")
-            for _ in xrange(len(source_shapes))
+            for _ in range(len(source_shapes))
        ]
        source_vars = []
        for var_name, shape, dtype, lod_level in zip(

--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import re
-import cStringIO
 import functools
 import warnings
 import string
+from six.moves import cStringIO
 from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
@@ -70,7 +70,7 @@ def _generate_doc_string_(op_proto):
    if not isinstance(op_proto, framework_pb2.OpProto):
        raise TypeError("OpProto should be `framework_pb2.OpProto`")
-    buf = cStringIO.StringIO()
+    buf = cStringIO()
    buf.write(escape_math(op_proto.comment))
    buf.write('\nArgs:\n')
    for each_input in op_proto.inputs:
@@ -119,9 +119,9 @@ def generate_layer_fn(op_type):
    """
    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
    not_intermediate_outputs = \
-        filter(lambda output: not output.intermediate, op_proto.outputs)
+        [output for output in op_proto.outputs if not output.intermediate]
    intermediate_outputs = \
-        filter(lambda output: output.intermediate, op_proto.outputs)
+        [output for output in op_proto.outputs if output.intermediate]
    if len(not_intermediate_outputs) != 1:
        raise ValueError("Only one non intermediate output operator can be",

--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -20,10 +20,10 @@ User can also implement their own learning_rate_decay
 strategy according to this module.
 """
-import control_flow
+from . import control_flow
-import nn
+from . import nn
-import ops
+from . import ops
-import tensor
+from . import tensor
 from ..initializer import init_on_cpu
 from ..framework import default_main_program, Parameter

--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from ..framework import Variable, unique_name
-from layer_function_generator import OpProtoHolder
+from .layer_function_generator import OpProtoHolder
 from ..initializer import force_init_on_cpu

--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -20,7 +20,7 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-import nn
+from . import nn
 __all__ = ['accuracy', 'auc']

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -33,11 +33,12 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc, templatedoc
+from .layer_function_generator import autodoc, templatedoc
-from tensor import concat
+from .tensor import concat
-import utils
+from . import utils
 import random
 from .. import unique_name
+from functools import reduce
 __all__ = [
    'fc',
@@ -4849,7 +4850,7 @@ def dice_loss(input, label, epsilon=0.00001):
            loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
    """
    label = one_hot(label, depth=input.shape[-1])
-    reduce_dim = range(1, len(input.shape))
+    reduce_dim = list(range(1, len(input.shape)))
    inse = reduce_sum(input * label, dim=reduce_dim)
    dice_denominator = reduce_sum(
        input, dim=reduce_dim) + reduce_sum(

--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from layer_function_generator import generate_layer_fn
+from .layer_function_generator import generate_layer_fn
 __activations__ = [
    'sigmoid',

--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -18,7 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
-from layer_function_generator import templatedoc
+from .layer_function_generator import templatedoc
 import numpy
 __all__ = [

--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import core
+from . import core
 import numpy as np
 __all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
@@ -24,7 +24,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
    Create a lod tensor by doing the following:
-    1. Check that the length-based level of detail (LoD) also known as 
+    1. Check that the length-based level of detail (LoD) also known as
       recursive_sequence_lengths of the input is valid.
    2. Convert recursive_sequence_lengths to a offset-based LoD.
@@ -33,7 +33,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
       CPU or GPU device (based on input place).
    4. Set the level of detail (LoD) using the offset-based LoD.
    Examples:
        Suppose we want LoDTensor to hold data for sequences of word, where each
@@ -51,7 +51,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
    Args:
        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
            list holding the data to be copied.
-        recursive_seq_lens(list): a list of lists indicating the length-based level of detail 
+        recursive_seq_lens(list): a list of lists indicating the length-based level of detail
            info specified by the user.
        place(Place): CPU or GPU place indicating where the data in the new
            LoDTensor will be stored.
@@ -62,10 +62,10 @@ def create_lod_tensor(data, recursive_seq_lens, place):
    if isinstance(data, core.LoDTensor):
        return create_lod_tensor(np.array(data), recursive_seq_lens, place)
    elif isinstance(data, list):
-        # When input data is a list, it only deal with the case where the base element 
+        # When input data is a list, it only deals with the case where the base element
-        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
+        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated
-        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
+        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number
-        # of words or other indexes in the sequence. 
+        # of words or other indexes in the sequence.
        new_recursive_seq_lens = []
        for seq in data:
            new_recursive_seq_lens.append(len(seq))
@@ -109,12 +109,12 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
    Suppose we want LoDTensor to hold data for sequences of word, where each
    word is represented by an integer. If we want to create a LoDTensor to
    represent two sentences, one of 2 words, and one of 3 words. Then
-    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. 
+    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]].
-    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words 
+    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words
    for two sentences.
    Args:
-        recursive_seq_lens(list): a list of lists indicating the length-based 
+        recursive_seq_lens(list): a list of lists indicating the length-based
            level of detail info specified by the user.
        base_shape(list): the shape of the basic element to be held by the
            LoDTensor.
@@ -124,11 +124,11 @@ def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
        high(int): the upper bound of the random integers.
    Returns:
-        A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
    """
    assert isinstance(base_shape, list), "base_shape should be a list"
    # append the total number of basic elements to the front of its shape
    overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
-    # the range of integer data elements is [low, high]    
+    # the range of integer data elements is [low, high]
    data = np.random.random_integers(low, high, overall_shape).astype("int64")
    return create_lod_tensor(data, recursive_seq_lens, place)
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -79,10 +79,10 @@ class MetricBase(object):
        """
        states = {
            attr: value
-            for attr, value in self.__dict__.iteritems()
+            for attr, value in list(self.__dict__.items())
            if not attr.startswith("_")
        }
-        for attr, value in states.iteritems():
+        for attr, value in list(states.items()):
            if isinstance(value, int):
                setattr(self, attr, 0)
            elif isinstance(value, float):
@@ -105,7 +105,7 @@ class MetricBase(object):
        """
        states = {
            attr: value
-            for attr, value in self.__dict__.iteritems()
+            for attr, value in list(self.__dict__.items())
            if not attr.startswith("_")
        }
        config = {}

--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
@@ -24,7 +24,7 @@ logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
 try:
-    from graphviz import Digraph
+    from .graphviz import Digraph
 except ImportError:
    logger.info(
        'Cannot import graphviz, which is required for drawing a network. This '
@@ -77,7 +77,7 @@ def parse_graph(program, graph, var_dict, **kwargs):
    # fill the known variables
    for block in program.blocks:
        for var in block.vars:
-            if not var_dict.has_key(var):
+            if var not in var_dict:
                var_dict[var] = "Feed"
    temp_id = 0
@@ -93,17 +93,17 @@ def parse_graph(program, graph, var_dict, **kwargs):
                    var_dict[arg] = op.type
            for e in op.inputs:
                for arg in e.arguments:
-                    if var_dict.has_key(arg):
+                    if arg in var_dict:
                        graph.edge(**draw_edge(var_dict, op, e, arg))
        break  # only plot the first block
 def draw_graph(startup_program, main_program, **kwargs):
-    if kwargs.has_key("graph_attr"):
+    if "graph_attr" in kwargs:
        GRAPH_STYLE.update(kwargs[graph_attr])
-    if kwargs.has_key("node_attr"):
+    if "node_attr" in kwargs:
        OP_STYLE.update(kwargs[node_attr])
-    if kwargs.has_key("edge_attr"):
+    if "edge_attr" in kwargs:
        VAR_STYLE.update(kwargs[edge_attr])
    graph_id = unique_id()

--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import layers
+from . import layers
 __all__ = [
    "simple_img_conv_pool",
@@ -210,7 +210,7 @@ def img_conv_group(input,
    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
-    for i in xrange(len(conv_num_filter)):
+    for i in range(len(conv_num_filter)):
        local_conv_act = conv_act
        if conv_with_batchnorm[i]:
            local_conv_act = None
@@ -488,10 +488,11 @@ def scaled_dot_product_attention(queries,
        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        return layers.reshape(
            x=trans_x,
-            shape=map(int, [
+            shape=list(
-                trans_x.shape[0], trans_x.shape[1],
+                map(int, [
-                trans_x.shape[2] * trans_x.shape[3]
+                    trans_x.shape[0], trans_x.shape[1], trans_x.shape[2] *
-            ]))
+                    trans_x.shape[3]
+                ])))
    q, k, v = __compute_qkv(queries, keys, values, num_heads)

--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import six
 import paddle.fluid.core as core
 import paddle.fluid.proto.framework_pb2 as framework_pb2
@@ -24,13 +26,13 @@ def get_all_op_protos():
    protostrs = core.get_all_op_protos()
    ret_values = []
    for pbstr in protostrs:
-        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        op_proto = framework_pb2.OpProto.FromString(six.binary_type(pbstr))
        ret_values.append(op_proto)
    return ret_values
 def is_str(s):
-    return isinstance(s, str) or isinstance(s, unicode)
+    return isinstance(s, six.string_types)
 class OpDescCreationMethod(object):
@@ -189,7 +191,7 @@ class OperatorFactory(object):
        return self.get_op_info(t).method(**kwargs)
    def types(self):
-        return self.op_methods.keys()
+        return list(self.op_methods.keys())
    def get_op_info(self, t):
        if t not in self.op_methods:
@@ -197,13 +199,13 @@ class OperatorFactory(object):
        return self.op_methods.get(t)
    def get_op_input_names(self, type):
-        return map(lambda x: x[0], self.get_op_info(type).inputs)
+        return [x[0] for x in self.get_op_info(type).inputs]
    def get_op_inputs(self, type):
        return self.get_op_info(type).inputs
    def get_op_output_names(self, type):
-        return map(lambda x: x[0], self.get_op_info(type).outputs)
+        return [x[0] for x in self.get_op_info(type).outputs]
    def get_op_outputs(self, type):
        return self.get_op_info(type).outputs

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -14,15 +14,15 @@
 import re
 from collections import defaultdict
 from paddle.fluid.framework import Program, Variable
-import framework
+from . import framework
-import layers
+from . import layers
-from backward import append_backward
+from .backward import append_backward
-from framework import program_guard
+from .framework import program_guard
-import unique_name
+from . import unique_name
-from initializer import Constant
+from .initializer import Constant
-from layer_helper import LayerHelper
+from .layer_helper import LayerHelper
-from regularizer import append_regularization_ops
+from .regularizer import append_regularization_ops
-from clip import append_gradient_clip_ops, error_clip_callback
+from .clip import append_gradient_clip_ops, error_clip_callback
 from contextlib import contextmanager
 __all__ = [

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import core
+from __future__ import print_function
 import multiprocessing
-import framework
+from . import core
-import executor
+from . import framework
+from . import executor
 import warnings
 import sys
 import os
@@ -94,7 +95,7 @@ class ParallelExecutor(object):
        self._places = []
        self._act_places = []
        if use_cuda:
-            for i in xrange(core.get_cuda_device_count()):
+            for i in range(core.get_cuda_device_count()):
                p = core.Place()
                self._act_places.append(core.CUDAPlace(i))
                p.set_place(self._act_places[-1])
@@ -102,7 +103,7 @@ class ParallelExecutor(object):
        else:
            cpu_num = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-            for i in xrange(cpu_num):
+            for i in range(cpu_num):
                p = core.Place()
                self._act_places.append(core.CPUPlace())
                p.set_place(self._act_places[-1])
@@ -143,16 +144,16 @@ class ParallelExecutor(object):
        ) if share_vars_from else []
        self.persistable_vars = [
-            v.name
+            v.name for v in [
-            for v in filter(
+                var for var in main.list_vars()
-                lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
+                if var.persistable and var.type != core.VarDesc.VarType.RAW
-                main.list_vars())
+            ]
        ]
        self.executor = core.ParallelExecutor(
            self._places,
            set([
-                p.name for p in main.global_block()._iter_parameters()
+                p.name for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
            set(self.persistable_vars), main.desc, loss_name
@@ -227,7 +228,9 @@ class ParallelExecutor(object):
        """
        if feed is None and feed_dict is not None:
            feed = feed_dict
-            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
+            print(
+                "`feed_dict` is deprecated. Please use `feed=`",
+                file=sys.stderr)
        if isinstance(feed, dict):
            feed_tensor_dict = dict()

--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -12,8 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from initializer import Initializer, Xavier, Constant
+import six
-from regularizer import WeightDecayRegularizer
+from .initializer import Initializer, Xavier, Constant
+from .regularizer import WeightDecayRegularizer
 __all__ = [
    'ParamAttr',
@@ -134,7 +136,7 @@ class ParamAttr(object):
            return [ParamAttr._to_attr(a) for a in arg]
        elif isinstance(arg, ParamAttr):
            return arg
-        elif isinstance(arg, str) or isinstance(arg, unicode):
+        elif isinstance(arg, six.string_types):
            return ParamAttr(name=arg)
        elif isinstance(arg, Initializer):
            return ParamAttr(initializer=arg)

--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import core
+from . import core
 from contextlib import contextmanager
 import os
@@ -224,7 +224,7 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    If the state == 'All', a profile proto file will be written to
    `profile_path`. This file records timeline information during the execution.
-    Then users can visualize this file to see the timeline, please refer 
+    Then users can visualize this file to see the timeline, please refer
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
    Args:

--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 import os
-import core
 import contextlib
+from . import core
 __all__ = [
    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
 ]

--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import framework
+from . import framework
 from . import core
 __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer']

--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -63,7 +63,7 @@ def train(use_cuda, train_program, params_dirname):
            if event.step == 10:
                test_metrics = trainer.test(
                    reader=test_reader, feed_order=['x', 'y'])
-                print test_metrics
+                print(test_metrics)
                '''
                ...
                ['25.768919467926025']

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
@@ -28,11 +28,12 @@ images per class.
 """
-import cPickle
 import itertools
 import numpy
 import paddle.v2.dataset.common
 import tarfile
+from six.moves import cPickle as pickle
+from six.moves import zip
 __all__ = ['train10']
@@ -46,7 +47,7 @@ def reader_creator(filename, sub_name, batch_size=None):
        data = batch['data']
        labels = batch.get('labels', batch.get('fine_labels', None))
        assert labels is not None
-        for sample, label in itertools.izip(data, labels):
+        for sample, label in zip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)
    def reader():
@@ -56,7 +57,7 @@ def reader_creator(filename, sub_name, batch_size=None):
            batch_count = 0
            for name in names:
-                batch = cPickle.load(f.extractfile(name))
+                batch = pickle.load(f.extractfile(name))
                for item in read_batch(batch):
                    if isinstance(batch_size, int) and batch_count > batch_size:
                        break

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy

--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy

--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import numpy as np
@@ -178,14 +176,15 @@ def train(use_cuda, train_program, params_dirname):
            if float(avg_cost) < 100.0:  # Large value to increase CI speed
                trainer.save_params(params_dirname)
            else:
-                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                print(
-                                                              float(avg_cost)))
+                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                             float(avg_cost))))
                if math.isnan(float(avg_cost)):
                    sys.exit("got NaN loss, training failed.")
        elif isinstance(event, fluid.EndStepEvent):
            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
            if event.step == 1:  # Run 2 iterations to speed CI
                trainer.save_params(params_dirname)
                trainer.stop()
@@ -207,14 +206,14 @@ def infer(use_cuda, inference_program, params_dirname):
        inference_program, param_path=params_dirname, place=place)
    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
+    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # each word (base_shape) should be [1] since it is simply an index to
    # look up for the corresponding word vector.
    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
+    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic 
+    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively. 
+    # length 3, 4 and 2, respectively.
    # Note that recursive_sequence_lengths should be a list of lists.
    recursive_seq_lens = [[3, 4, 2]]
    base_shape = [1]

--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -250,7 +250,7 @@ def decode_main(use_cuda, is_sparse):
    feeder = fluid.DataFeeder(feed_list, place)
    for data in train_data():
-        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict = feeder.feed([[x[0]] for x in data])
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores
@@ -259,7 +259,7 @@ def decode_main(use_cuda, is_sparse):
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            return_numpy=False)
-        print result_ids.recursive_sequence_lengths()
+        print(result_ids.recursive_sequence_lengths())
        break

--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -89,8 +89,10 @@ def train(use_cuda, train_program, params_dirname):
                if math.isnan(avg_cost):
                    sys.exit("got NaN loss, training failed.")
        elif isinstance(event, fluid.EndStepEvent):
-            print("Step {0}, Epoch {1} Metrics {2}".format(
+            print(
-                event.step, event.epoch, map(numpy.array, event.metrics)))
+                ("Step {0}, Epoch {1} Metrics {2}".format(
+                    event.step, event.epoch,
+                    list(map(numpy.array, event.metrics)))))
    train_reader = paddle.batch(
        paddle.reader.shuffle(

--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
 import paddle

--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -186,8 +186,9 @@ def train(use_cuda, train_program, params_dirname):
                trainer.save_params(params_dirname)
                trainer.stop()
            else:
-                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                print(
-                                                              float(avg_cost)))
+                    ('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                             float(avg_cost))))
                if math.isnan(float(avg_cost)):
                    sys.exit("got NaN loss, training failed.")

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -98,7 +96,7 @@ def train(use_cuda, train_program, params_dirname):
                    sys.exit("got NaN loss, training failed.")
        elif isinstance(event, fluid.EndStepEvent):
            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
            if event.step == 1:  # Run 2 iterations to speed CI
                trainer.save_params(params_dirname)
                trainer.stop()
@@ -125,14 +123,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
        place=place)
    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
+    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # each word (base_shape) should be [1] since it is simply an index to
    # look up for the corresponding word vector.
    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
+    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic 
+    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively. 
+    # length 3, 4 and 2, respectively.
    # Note that recursive_sequence_lengths should be a list of lists.
    recursive_seq_lens = [[3, 4, 2]]
    base_shape = [1]

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -113,7 +111,7 @@ def train(use_cuda, train_program, params_dirname):
                    sys.exit("got NaN loss, training failed.")
        elif isinstance(event, fluid.EndStepEvent):
            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
            if event.step == 1:  # Run 2 iterations to speed CI
                trainer.save_params(params_dirname)
                trainer.stop()
@@ -140,14 +138,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
        place=place)
    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
+    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # each word (base_shape) should be [1] since it is simply an index to
    # look up for the corresponding word vector.
    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
+    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic 
+    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively. 
+    # length 3, 4 and 2, respectively.
    # Note that recursive_sequence_lengths should be a list of lists.
    recursive_seq_lens = [[3, 4, 2]]
    base_shape = [1]

--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 from functools import partial
@@ -107,7 +105,7 @@ def train(use_cuda, train_program, params_dirname):
                    sys.exit("got NaN loss, training failed.")
        elif isinstance(event, fluid.EndStepEvent):
            print("Step {0}, Epoch {1} Metrics {2}".format(
-                event.step, event.epoch, map(np.array, event.metrics)))
+                event.step, event.epoch, list(map(np.array, event.metrics))))
            if event.step == 1:  # Run 2 iterations to speed CI
                trainer.save_params(params_dirname)
                trainer.stop()
@@ -135,14 +133,14 @@ def infer(use_cuda, inference_program, params_dirname=None):
        place=place)
    # Setup input by creating LoDTensor to represent sequence of words.
-    # Here each word is the basic element of the LoDTensor and the shape of 
+    # Here each word is the basic element of the LoDTensor and the shape of
-    # each word (base_shape) should be [1] since it is simply an index to 
+    # each word (base_shape) should be [1] since it is simply an index to
    # look up for the corresponding word vector.
    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-    # which has only one level of detail. Then the created LoDTensor will have only 
+    # which has only one level of detail. Then the created LoDTensor will have only
-    # one higher level structure (sequence of words, or sentence) than the basic 
+    # one higher level structure (sequence of words, or sentence) than the basic
-    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # element (word). Hence the LoDTensor will hold data for three sentences of
-    # length 3, 4 and 2, respectively. 
+    # length 3, 4 and 2, respectively.
    # Note that recursive_sequence_lengths should be a list of lists.
    recursive_seq_lens = [[3, 4, 2]]
    base_shape = [1]

--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
@@ -175,7 +175,7 @@ def train(word_dict,
    def train_loop(main_program):
        exe.run(fluid.default_startup_program())
-        for pass_id in xrange(PASS_NUM):
+        for pass_id in range(PASS_NUM):
            for data in train_data():
                cost_val, acc_val = exe.run(main_program,
                                            feed=feeder.feed(data),
@@ -235,14 +235,14 @@ def infer(word_dict, use_cuda, save_dirname=None):
        word_dict_len = len(word_dict)
        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
+        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # each word (base_shape) should be [1] since it is simply an index to
        # look up for the corresponding word vector.
        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
+        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic 
+        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # element (word). Hence the LoDTensor will hold data for three sentences of
-        # length 3, 4 and 2, respectively. 
+        # length 3, 4 and 2, respectively.
        # Note that recursive_sequence_lengths should be a list of lists.
        recursive_seq_lens = [[3, 4, 2]]
        base_shape = [1]

--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -114,7 +114,7 @@ def infer(use_cuda, save_dirname=None):
        test_reader = paddle.batch(
            paddle.dataset.uci_housing.test(), batch_size=batch_size)
-        test_data = test_reader().next()
+        test_data = next(test_reader())
        test_feat = numpy.array(
            [data[0] for data in test_data]).astype("float32")
        test_label = numpy.array(

--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle
 import paddle.fluid as fluid
 import contextlib
@@ -121,7 +119,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
    avg_cost = fluid.layers.mean(cost)
    acc = fluid.layers.accuracy(input=predict, label=label)
-    # Test program 
+    # Test program
    test_program = fluid.default_main_program().clone(for_test=True)
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)

--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -181,7 +181,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
        start_time = time.time()
        batch_id = 0
-        for pass_id in xrange(PASS_NUM):
+        for pass_id in range(PASS_NUM):
            for data in train_data():
                cost = exe.run(main_program,
                               feed=feeder.feed(data),
@@ -248,14 +248,14 @@ def infer(use_cuda, save_dirname=None):
         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
+        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # each word (base_shape) should be [1] since it is simply an index to
        # look up for the corresponding word vector.
        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
+        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic 
+        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # element (word). Hence the LoDTensor will hold data for three sentences of
-        # length 3, 4 and 2, respectively. 
+        # length 3, 4 and 2, respectively.
        # Note that recursive_sequence_lengths should be a list of lists.
        recursive_seq_lens = [[3, 4, 2]]
        base_shape = [1]

--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -199,7 +199,7 @@ def train_main(use_cuda, is_sparse, is_local=True):
        feeder = fluid.DataFeeder(feed_list, place)
        batch_id = 0
-        for pass_id in xrange(1):
+        for pass_id in range(1):
            for data in train_data():
                outs = exe.run(main_program,
                               feed=feeder.feed(data),
@@ -273,7 +273,7 @@ def decode_main(use_cuda, is_sparse):
    feeder = fluid.DataFeeder(feed_list, place)
    for data in train_data():
-        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict = feeder.feed([[x[0]] for x in data])
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores
@@ -282,7 +282,7 @@ def decode_main(use_cuda, is_sparse):
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            return_numpy=False)
-        print result_ids.recursive_sequence_lengths()
+        print(result_ids.recursive_sequence_lengths())
        break

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle.fluid.core as core
 import math

--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -260,15 +260,15 @@ def infer(use_cuda, save_dirname=None):
        # Use the first data from paddle.dataset.movielens.test() as input
        assert feed_target_names[0] == "user_id"
-        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
+        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API
-        # to generate LoD Tensor where `data` is a list of sequences of index 
+        # to generate LoD Tensor where `data` is a list of sequences of index
-        # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+        # numbers, `recursive_sequence_lengths` is the length-based level of detail
        # (lod) info associated with `data`.
        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
        # two sequences of indexes, of length 3 and 2, respectively.
-        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
+        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one
-        # level of detail info, indicating that `data` consists of two sequences 
+        # level of detail info, indicating that `data` consists of two sequences
-        # of length 3 and 2, respectively. 
+        # of length 3 and 2, respectively.
        user_id = fluid.create_lod_tensor([[1]], [[1]], place)
        assert feed_target_names[1] == "gender_id"

--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -175,7 +175,7 @@ def train(use_cuda, save_dirname=None):
    feeder = fluid.DataFeeder(feed_list, place)
    batch_id = 0
-    for pass_id in xrange(2):
+    for pass_id in range(2):
        for data in train_data():
            outs = exe.run(framework.default_main_program(),
                           feed=feeder.feed(data),
@@ -213,14 +213,14 @@ def infer(use_cuda, save_dirname=None):
         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
        # Setup input by creating LoDTensor to represent sequence of words.
-        # Here each word is the basic element of the LoDTensor and the shape of 
+        # Here each word is the basic element of the LoDTensor and the shape of
-        # each word (base_shape) should be [1] since it is simply an index to 
+        # each word (base_shape) should be [1] since it is simply an index to
        # look up for the corresponding word vector.
        # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
-        # which has only one level of detail. Then the created LoDTensor will have only 
+        # which has only one level of detail. Then the created LoDTensor will have only
-        # one higher level structure (sequence of words, or sentence) than the basic 
+        # one higher level structure (sequence of words, or sentence) than the basic
-        # element (word). Hence the LoDTensor will hold data for two sentences of 
+        # element (word). Hence the LoDTensor will hold data for two sentences of
-        # length 4 and 6, respectively. 
+        # length 4 and 6, respectively.
        # Note that recursive_sequence_lengths should be a list of lists.
        recursive_seq_lens = [[4, 6]]
        base_shape = [1]

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -85,9 +85,11 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            avg_cost, predict_word = __network__(
-                map(pd.read_input, [
+                list(
-                    first_word, second_word, third_word, forth_word, next_word
+                    map(pd.read_input, [
-                ]))
+                        first_word, second_word, third_word, forth_word,
+                        next_word
+                    ])))
            pd.write_output(avg_cost)
        avg_cost = fluid.layers.mean(pd())
@@ -167,11 +169,11 @@ def infer(use_cuda, save_dirname=None):
        word_dict = paddle.dataset.imikolov.build_dict()
        dict_size = len(word_dict)
-        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word
-        # is simply an index to look up for the corresponding word vector and hence 
+        # is simply an index to look up for the corresponding word vector and hence
-        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
+        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths,
-        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
+        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]]
-        # meaning there is only one level of detail and there is only one sequence of 
+        # meaning there is only one level of detail and there is only one sequence of
        # one word on this level.
        # Note that recursive_sequence_lengths should be a list of lists.
        recursive_seq_lens = [[1]]

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -78,7 +78,7 @@ for pass_id in range(PASS_NUM):
        if avg_loss_value[0] < 10.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
-        print avg_loss_value[0]
+        print(avg_loss_value[0])
        if math.isnan(float(avg_loss_value)):
            sys.exit("got NaN loss, training failed.")
 exit(1)
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import sys
 import paddle

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -118,7 +118,7 @@ def main():
    feeder = fluid.DataFeeder(feed_list, place)
    batch_id = 0
-    for pass_id in xrange(10):
+    for pass_id in range(10):
        for data in train_data():
            outs = exe.run(fluid.default_main_program(),
                           feed=feeder.feed(data),

--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
@@ -137,7 +137,7 @@ def main():
            generated_img = exe.run(g_program,
                                    feed={'noise': n},
                                    fetch_list={g_img})[0]
-            real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
+            real_data = numpy.array([x[0] for x in data]).astype('float32')
            real_data = real_data.reshape(num_true, 784)
            total_data = numpy.concatenate([real_data, generated_img])
            total_label = numpy.concatenate([
@@ -150,7 +150,7 @@ def main():
                                feed={'img': total_data,
                                      'label': total_label},
                                fetch_list={d_loss})[0]
-            for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
+            for _ in range(NUM_TRAIN_TIMES_OF_DG):
                n = numpy.random.uniform(
                    low=-1.0, high=1.0,
                    size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(

--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
@@ -36,7 +36,7 @@ if len(sys.argv) == 1:
 else:
    word_dict = load_vocab(sys.argv[1])
    word_dict["<unk>"] = len(word_dict)
-print "Dict dim = ", len(word_dict)
+print("Dict dim = ", len(word_dict))
 # input text data
 data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)

--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
@@ -194,7 +194,7 @@ class TestRoutineOp(unittest.TestCase):
            quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
            with fluid.Go():
-                for i in xrange(10):
+                for i in range(10):
                    fluid.channel_recv(ch1, result)
                    Print(result)

--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
@@ -155,7 +155,7 @@ def train_main(use_cuda):
        ]
        feeder = fluid.DataFeeder(feed_list, place)
-        for pass_id in xrange(1):
+        for pass_id in range(1):
            for batch_id, data in enumerate(train_reader()):
                outs = exe.run(main_program,
                               feed=feeder.feed(data),
@@ -204,8 +204,8 @@ def decode_main(use_cuda):
    ]
    feeder = fluid.DataFeeder(feed_list, place)
-    data = train_reader().next()
+    data = next(train_reader())
-    feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+    feed_dict = feeder.feed([[x[0]] for x in data])
    feed_dict['init_ids'] = init_ids
    feed_dict['init_scores'] = init_scores
@@ -214,7 +214,7 @@ def decode_main(use_cuda):
        feed=feed_dict,
        fetch_list=[translation_ids, translation_scores],
        return_numpy=False)
-    print result_ids.lod()
+    print(result_ids.lod())
 class TestBeamSearchDecoder(unittest.TestCase):

--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard

--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import print_function
 import numpy as np
 import paddle
 import paddle.fluid as fluid

--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -76,15 +76,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
-                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                x_data = np.array([x[0] for x in data]).astype("float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = np.array([x[1] for x in data]).astype("int64")
                y_data = np.expand_dims(y_data, axis=1)
                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
-                print outs[0]
+                print(outs[0])
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)
@@ -131,15 +131,15 @@ class TestMNISTIfElseOp(unittest.TestCase):
        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for data in train_reader():
-                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                x_data = np.array([x[0] for x in data]).astype("float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = np.array([x[1] for x in data]).astype("int64")
                y_data = y_data.reshape((y_data.shape[0], 1))
                outs = exe.run(prog,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=[avg_loss])
-                print outs[0]
+                print(outs[0])
                if outs[0] < 1.0:
                    return
        self.assertFalse(True)

--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -16,6 +16,7 @@ import numpy as np
 import unittest
 import time
 import itertools
+import six
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -40,8 +41,8 @@ class BenchmarkSuite(OpTest):
            expect_t = np.array(item_cpu_out)
            actual = item_gpu_out
            actual_t = np.array(item_gpu_out)
-            var_name = variable if isinstance(variable,
+            var_name = variable if isinstance(
-                                              basestring) else variable.name
+                variable, six.string_types) else variable.name
            self.assertTrue(
                np.allclose(
                    actual_t, expect_t, atol=atol),
@@ -53,7 +54,7 @@ class BenchmarkSuite(OpTest):
    def _get_input_names(self):
        inputs = []
-        for name, value in self.inputs.iteritems():
+        for name, value in list(self.inputs.items()):
            if isinstance(value, list):
                inputs.extend([sub_name for sub_name, _ in value])
            inputs.append(name)
@@ -61,7 +62,7 @@ class BenchmarkSuite(OpTest):
    def _get_output_names(self):
        outputs = []
-        for var_name, var in self.outputs.iteritems():
+        for var_name, var in list(self.outputs.items()):
            if isinstance(var, list):
                for sub_var_name, sub_var in var:
                    outputs.append(sub_var_name)

--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -14,6 +14,7 @@
 import numpy as np
 import argparse
+import six
 import time
 import math
@@ -299,7 +300,7 @@ class DistSeResneXt2x2:
            True, loss_name=avg_cost.name, exec_strategy=strategy)
        feed_var_list = [
-            var for var in trainer_prog.global_block().vars.itervalues()
+            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]
@@ -311,7 +312,7 @@ class DistSeResneXt2x2:
                              feed=feeder.feed(data))
        print(first_loss)
-        for i in xrange(5):
+        for i in six.moves.xrange(5):
            data = next(reader_generator)
            loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data))

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -26,13 +26,15 @@ from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
 from paddle.fluid.framework import Program, OpProtoHolder, Variable
 from testsuite import create_op, set_input, append_input_output, append_loss_ops
+from functools import reduce
+from six.moves import zip
 def randomize_probability(batch_size, class_num, dtype='float32'):
    prob = np.random.uniform(
        0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
    prob_sum = prob.sum(axis=1)
-    for i in xrange(len(prob)):
+    for i in range(len(prob)):
        prob[i] /= prob_sum[i]
    return prob
@@ -101,7 +103,7 @@ def get_numeric_gradient(place,
    # we only compute gradient of one element each time.
    # we use a for loop to compute the gradient of every element.
-    for i in xrange(tensor_size):
+    for i in range(tensor_size):
        if in_place:
            set_input(scope, op, inputs, place)
@@ -159,7 +161,7 @@ class OpTest(unittest.TestCase):
            assert isinstance(
                numpy_dict,
                dict), "self.inputs, self.outputs must be numpy_dict"
-            for var_name, var_value in numpy_dict.iteritems():
+            for var_name, var_value in numpy_dict.items():
                if isinstance(var_value, (np.ndarray, np.generic)):
                    self.try_call_once(var_value.dtype)
                elif isinstance(var_value, (list, tuple)):
@@ -223,7 +225,7 @@ class OpTest(unittest.TestCase):
    def _get_io_vars(self, block, numpy_inputs):
        inputs = {}
-        for name, value in numpy_inputs.iteritems():
+        for name, value in numpy_inputs.items():
            if isinstance(value, list):
                var_list = [
                    block.var(sub_name) for sub_name, sub_value in value
@@ -266,7 +268,7 @@ class OpTest(unittest.TestCase):
        # if the fetch_list is customized by user, we use it directly.
        # if not, fill the fetch_list by the user configured outputs in test.
        if len(fetch_list) == 0:
-            for var_name, var in outputs.iteritems():
+            for var_name, var in outputs.items():
                if isinstance(var, list):
                    for v in var:
                        fetch_list.append(v)
@@ -278,7 +280,7 @@ class OpTest(unittest.TestCase):
                fetch_list.append(str(out_name))
        # fetch_list = map(block.var, fetch_list)
        if not isinstance(fetch_list[0], fluid.framework.Variable):
-            fetch_list = map(block.var, fetch_list)
+            fetch_list = list(map(block.var, fetch_list))
        outs = executor.run(program,
                            feed=feed_map,
                            fetch_list=fetch_list,
@@ -369,7 +371,7 @@ class OpTest(unittest.TestCase):
    def __assert_is_close(self, numeric_grads, analytic_grads, names,
                          max_relative_error, msg_prefix):
-        for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
+        for a, b, name in zip(numeric_grads, analytic_grads, names):
            abs_a = np.abs(a)
            abs_a[abs_a < 1e-3] = 1
@@ -510,6 +512,6 @@ class OpTest(unittest.TestCase):
                use_cuda=use_cuda, loss_name=loss.name, main_program=prog)
        else:
            executor = Executor(place)
-        return map(np.array,
+        return list(
-                   executor.run(prog, feed_dict, fetch_list,
+            map(np.array,
-                                return_numpy=False))
+                executor.run(prog, feed_dict, fetch_list, return_numpy=False)))
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -91,7 +91,7 @@ class TestParallelExecutorBase(unittest.TestCase):
            first_loss, = run_executor(
                exe=exe, feed=feed_dict, fetch_list=[loss.name])
-            for i in xrange(iter):
+            for i in range(iter):
                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
            last_loss, = run_executor(
@@ -99,8 +99,8 @@ class TestParallelExecutorBase(unittest.TestCase):
            end = time.time()
            if batch_size is not None:
-                print "%.4f Instance per second" % (
+                print("%.4f Instance per second" % (
-                    (batch_size * iter + 2) / (end - begin))
+                    (batch_size * iter + 2) / (end - begin)))
            avg_last_loss_val = np.array(last_loss).mean()
            avg_first_loss_val = np.array(first_loss).mean()
@@ -108,6 +108,6 @@ class TestParallelExecutorBase(unittest.TestCase):
                    float(avg_first_loss_val)):
                sys.exit("got NaN loss, training failed.")
-            print first_loss, last_loss
+            print(first_loss, last_loss)
            # self.assertGreater(first_loss[0], last_loss[0])
            return first_loss, last_loss
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
@@ -26,7 +26,7 @@ class TestAccuracyOp(OpTest):
        label = np.random.randint(0, 2, (n, 1))
        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
        num_correct = 0
-        for rowid in xrange(n):
+        for rowid in range(n):
            for ele in indices[rowid]:
                if ele == label[rowid]:
                    num_correct += 1

--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -273,7 +273,7 @@ class TestSparseAdamOp(unittest.TestCase):
        self.setup(scope, place)
        op_args = dict()
-        for key, np_array in self.dense_inputs.iteritems():
+        for key, np_array in self.dense_inputs.items():
            var = scope.var(key).get_tensor()
            var.set(np_array, place)
            op_args[key] = key
@@ -290,7 +290,7 @@ class TestSparseAdamOp(unittest.TestCase):
        adam_op = Operator("adam", **op_args)
        adam_op.run(scope, place)
-        for key, np_array in self.outputs.iteritems():
+        for key, np_array in self.outputs.items():
            out_var = scope.var(key).get_tensor()
            actual = np.array(out_var)
            actual = actual.reshape([actual.size])

--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
@@ -80,8 +80,9 @@ class TestArrayReadWrite(unittest.TestCase):
        append_backward(total_sum_scaled)
-        g_vars = map(default_main_program().global_block().var,
+        g_vars = list(
-                     [each_x.name + "@GRAD" for each_x in x])
+            map(default_main_program().global_block().var,
+                [each_x.name + "@GRAD" for each_x in x]))
        g_out = [
            item.sum()
            for item in exe.run(

--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -415,7 +415,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
            self.__assert_close(scale_grad, out[6], "scale_grad")
            self.__assert_close(bias_grad, out[7], "bias_grad")
-            print "op test forward passed: ", str(place), data_layout
+            print("op test forward passed: ", str(place), data_layout)
        places = [core.CPUPlace()]

--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -59,8 +59,7 @@ class BeamSearchOpTester(unittest.TestCase):
            np.allclose(
                np.array(selected_scores),
                np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
-        self.assertEqual(selected_ids.lod(),
+        self.assertEqual(selected_ids.lod(), [[0, 2, 4], [0, 1, 2, 3, 4]])
-                         [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]])
    def _create_pre_ids(self):
        np_data = np.array([[1, 2, 3, 4]], dtype='int64')

--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spp_op.py
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
--- a/python/paddle/reader/creator.py
+++ b/python/paddle/reader/creator.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
--- a/python/paddle/trainer/PyDataProviderWrapper.py
+++ b/python/paddle/trainer/PyDataProviderWrapper.py
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/tools/test_runner.py
+++ b/tools/test_runner.py