diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9b9c6c664158253e4800944a487879b93aaee9a2..b7f2fa96a7cf9993e80477d0cb9cacb88ad3adef 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -31,7 +31,7 @@ paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'pr
 paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4'))
 paddle.fluid.DistributeTranspilerConfig.__init__ 
 paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '80d857dc626612e2b2460d0154551e95'))
+paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40'))
 paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd'))
 paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff'))
 paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb'))
@@ -313,7 +313,7 @@ paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=No
 paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
 paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08'))
-paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'a222dbad457441941e50b812e5af9c7e'))
+paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'ee6c70867d317b0a87094ed23546215f'))
 paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a'))
 paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a4e395ab004e7da34e94a0a1f9eee183'))
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f2508c52e0a797bb9bd5e29d79ede78'))
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index 06a454f4adac9a9110812f52af2676745656f168..5bbbf07e6d9fb8845d3f93d1d8124d3f557dba3c 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -71,6 +71,7 @@ void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() {
     if (local_scope_var != nullptr) {
       auto &local_scope = *local_scope_var->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
+      scope->EraseVars({std::string(details::kLocalExecScopeName)});
       VLOG(3) << "Drop local execution scope: " << local_scope;
     }
   }
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index 200b01797e4ed235863f9093cb8e674ed6d8f701..f686e5293b0f504863e228d37db56c6df4954c24 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -135,33 +135,34 @@ struct Formater {
 };
 
 // TODO(ChunweiYan) there should be some other printers for TensorArray
-class TensorPrintOp : public framework::OperatorBase {
+class PrintOp : public framework::OperatorBase {
  public:
-  TensorPrintOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
+  PrintOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  TensorPrintOp(const TensorPrintOp &o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase &>(o)) {
-    PADDLE_THROW("Not implemented.");
-  }
-
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    const framework::Variable *in_var_ptr = nullptr;
-    std::string printed_var_name = "";
-
-    in_var_ptr = scope.FindVar(Input("In"));
-    printed_var_name = Inputs("In").front();
-
-    PADDLE_ENFORCE_NOT_NULL(in_var_ptr);
-
-    auto &in_tensor = in_var_ptr->Get<framework::LoDTensor>();
+    const auto in_var = scope.FindVar(Input("In"));
+    auto out_var = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE_NOT_NULL(in_var, "The input should not be found in scope",
+                            Input("In"));
+    PADDLE_ENFORCE_NOT_NULL(out_var, "The output should not be found in scope",
+                            Output("Out"));
+    auto &in_tensor = in_var->Get<framework::LoDTensor>();
+    framework::LoDTensor *out_tensor =
+        out_var->GetMutable<framework::LoDTensor>();
+
+    PrintValue(place, Inputs("In").front(), in_tensor);
+    framework::TensorCopy(in_tensor, place, out_tensor);
+    out_tensor->set_lod(in_tensor.lod());
+  }
 
+  void PrintValue(const platform::Place &place,
+                  const std::string &printed_var_name,
+                  const framework::LoDTensor &in_tensor) const {
     std::string print_phase = Attr<std::string>("print_phase");
     bool is_forward = Attr<bool>("is_forward");
 
@@ -177,12 +178,12 @@ class TensorPrintOp : public framework::OperatorBase {
     printed_tensor.set_lod(in_tensor.lod());
     printed_tensor.Resize(in_tensor.dims());
 
-    if (platform::is_cpu_place(in_tensor.place())) {
+    if (is_cpu_place(in_tensor.place())) {
       printed_tensor.ShareDataWith(in_tensor);
     } else {
       // copy data to cpu to print
       platform::CPUPlace place;
-      framework::TensorCopy(in_tensor, place, &printed_tensor);
+      TensorCopy(in_tensor, place, &printed_tensor);
     }
 
     Formater formater;
@@ -215,6 +216,7 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("In", "Input tensor to be displayed.");
+    AddOutput("Out", "The output tensor.");
     AddAttr<int>("first_n", "Only log `first_n` number of times.");
     AddAttr<std::string>("message", "A string message to print as a prefix.");
     AddAttr<int>("summarize", "Number of elements printed.");
@@ -239,10 +241,23 @@ tensor `t`.)DOC");
   }
 };
 
-class InferShapeForward : public framework::InferShapeBase {
+class PrintOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    VLOG(10) << "PrintOpInferShape";
+    PADDLE_ENFORCE(ctx->HasInput("In"), "Input(In) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    ctx->ShareDim("In", /*->*/ "Out");
+    ctx->ShareLoD("In", /*->*/ "Out");
+  }
+};
+
+class PrintOpVarTypeInference : public framework::VarTypeInference {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null.");
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto input_type = ctx->GetType(ctx->Input("In")[0]);
+    auto out_name = ctx->Output("Out").front();
+    ctx->SetType(out_name, input_type);
   }
 };
 
@@ -253,7 +268,8 @@ class PrintOpGradientMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto *op_desc_ptr = new framework::OpDesc();
     op_desc_ptr->SetType("print");
-    op_desc_ptr->SetInput("In", InputGrad("In"));
+    op_desc_ptr->SetInput("In", OutputGrad("Out"));
+    op_desc_ptr->SetOutput("Out", InputGrad("In"));
     op_desc_ptr->SetAttrMap(Attrs());
     op_desc_ptr->SetAttr("is_forward", false);
     return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
@@ -265,5 +281,6 @@ class PrintOpGradientMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker,
-                  ops::PrintOpGradientMaker, ops::InferShapeForward);
+REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker,
+                  ops::PrintOpGradientMaker, ops::PrintOpInferShape,
+                  ops::PrintOpVarTypeInference);
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 62d66296606a7700d215f74d81ac565a5d0060d5..1407c93a240b03514d576cb9505a9ff7b600b4f5 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -27,7 +27,7 @@ import six
 import numpy as np
 import subprocess
 import multiprocessing
-
+import sys
 from .. import compat as cpt
 from .proto import framework_pb2
 
@@ -82,14 +82,14 @@ def _current_expected_place():
 
 
 def _cpu_num():
-    cpu_num = os.environ.get('CPU_NUM', None)
-    if cpu_num is None:
+    if "CPU_NUM" not in os.environ.keys():
         sys.stderr.write(
             'The CPU_NUM is not specified, you should set CPU_NUM in '
             'the environment variable list, i.e export CPU_NUM=1. CPU_NUM '
             'indicates that how many CPUPlace are used in the current task.\n'
-            '!!! The default number of CPUPlaces is 1.')
+            '!!! The default number of CPUPlaces is 1.\n\n')
         os.environ['CPU_NUM'] = str(1)
+    cpu_num = os.environ.get('CPU_NUM')
     return int(cpu_num)
 
 
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index f8c84a7029024f9a281503d16343544abdb78bcb..d073c15b0239626516c11dad87cc2d04d72eea90 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -165,8 +165,12 @@ def Print(input,
                 print the gradients of input tensor.
 
     Returns:
-        Variable: Output tensor, same data with input tensor.
+        Variable: Output tensor.
 
+    NOTES:
+        The input and output are two different variables, and in the
+        following process, you should use the output variable but not the input,
+        otherwise, the print layer doesn't have backward.
 
     Examples:
         .. code-block:: python
@@ -174,16 +178,18 @@ def Print(input,
            import paddle.fluid as fluid
            
            input = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
-           fluid.layers.Print(input, message = "The content of input layer:")
+           input = fluid.layers.Print(input, message = "The content of input layer:")
            # value = some_layer(...)
            # Print(value, summarize=10,
            #    message="The content of some_layer: ")
 
     '''
-    helper = LayerHelper('print', **locals())
+    helper = LayerHelper('print' + "_" + input.name, **locals())
+    output = helper.create_variable_for_type_inference(input.dtype)
     helper.append_op(
         type='print',
         inputs={'In': input},
+        outputs={'Out': output},
         attrs={
             'first_n': first_n,
             'summarize': summarize,
@@ -194,7 +200,7 @@ def Print(input,
             'print_tensor_lod': print_tensor_lod,
             'print_phase': print_phase.upper()
         })
-    return input
+    return output
 
 
 class BlockGuard(object):
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index f2cefeb3013c55febf995a7e59bbabcde76b94a3..d4a1041a4bf0566fc5e8e80e28804f1a50f86733 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -324,6 +324,7 @@ class ParallelExecutor(object):
                   loss = fluid.layers.mean(hidden)
 
               place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+              exe = fluid.Executor(place)
               exe.run(startup_program)
 
               parallel_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index 8e89aa6b7497f905f3b1f53df8ff1b44b28a628a..0fc11ef8d9220dcc6875b6df2a3e527244872e11 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -17,11 +17,13 @@ from __future__ import print_function
 import unittest
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
+import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.backward import append_backward
 from paddle.fluid.framework import switch_main_program
 from paddle.fluid.framework import Program
 import numpy as np
+from simple_nets import simple_fc_net, init_data
 
 
 class TestPrintOpCPU(unittest.TestCase):
@@ -89,5 +91,35 @@ class TestPrintOpGPU(TestPrintOpCPU):
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
 
+class TestPrintOpBackward(unittest.TestCase):
+    def check_backward(self, use_cuda):
+        main = fluid.Program()
+        startup = fluid.Program()
+
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            loss = fluid.layers.Print(loss)
+            fluid.optimizer.Adam().minimize(loss)
+
+        print_ops = [op for op in main.blocks[0].ops if op.type == u'print']
+        assert len(print_ops) == 2, "The number of print op should be 2"
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup)
+
+        binary = fluid.compiler.CompiledProgram(main).with_data_parallel(
+            loss_name=loss.name)
+
+        img, label = init_data()
+        feed_dict = {"image": img, "label": label}
+        exe.run(binary, feed_dict)
+
+    def test_fw_bw(self):
+        if core.is_compiled_with_cuda():
+            self.check_backward(use_cuda=True)
+        self.check_backward(use_cuda=False)
+
+
 if __name__ == '__main__':
     unittest.main()