fix prepare context redundant code problem, optimize executor by cach… (#17743)

* fix prepare context redundant code problem, optimize executor by caching create_varaiables test=develop * cache sub_scope, program, var when use_program_cache=True is set * make fetch_list runable with variables, add more unittest for use_program_cache

fix prepare context redundant code problem, optimize executor by cach… (#17743)
* fix prepare context redundant code problem, optimize executor by caching create_varaiables test=develop * cache sub_scope, program, var when use_program_cache=True is set * make fetch_list runable with variables, add more unittest for use_program_cache
d5239109 · guru4elephant · GitHub · 2c58f1a8 · d5239109 · d5239109
4 changed file
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -247,20 +247,7 @@ static bool has_fetch_operators(
 std::unique_ptr<ExecutorPrepareContext> Executor::PrepareCtxCache(
    const ProgramDesc& program, int block_id,
    const std::vector<std::string>& skip_ref_cnt_vars, bool force_disable_gc) {
-  std::unique_ptr<ExecutorPrepareContext> ctx;
+  return Prepare(program, block_id, skip_ref_cnt_vars, force_disable_gc);
-  ctx.reset(new ExecutorPrepareContext(program, block_id));
-  auto& block = program.Block(block_id);
-  for (auto& op_desc : block.AllOps()) {
-    ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-  }
-#ifdef PADDLE_WITH_NGRAPH
-  if (FLAGS_use_ngraph) {
-    paddle::operators::NgraphEngine::FuseNgraphOps(
-        ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
-  }
-#endif
-  ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc);
-  return ctx;
 }
 void Executor::Run(const ProgramDesc& program, Scope* scope,

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -943,6 +943,8 @@ All parameter, weight, gradient are variables in Paddle.
           })
      .def("prepare_ctx_cache", &Executor::PrepareCtxCache,
           py::call_guard<py::gil_scoped_release>())
+      .def("create_variables", &Executor::CreateVariables,
+           py::call_guard<py::gil_scoped_release>())
      .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
                     int block_id, bool create_local_scope, bool create_vars,
                     const std::vector<std::string> &fetch_vars) {

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -361,11 +361,19 @@ class Executor(object):
        self.place = place
        self.program_caches = dict()
        self.ctx_caches = dict()
+        self.scope_caches = dict()
+        self.var_caches = dict()
        p = core.Place()
        p.set_place(self.place)
        self._default_executor = core.Executor(p)
        self._closed = False
+    def _get_var_cache(self, program_cache_key):
+        return self.var_caches.get(program_cache_key, None)
+    def _get_scope_cache(self, program_cache_key):
+        return self.scope_caches.get(program_cache_key, None)
    def _get_ctx_cache(self, program_cache_key):
        return self.ctx_caches.get(program_cache_key, None)
@@ -378,6 +386,12 @@ class Executor(object):
    def _add_ctx_cache(self, ctx_cache_key, ctx):
        self.ctx_caches[ctx_cache_key] = ctx
+    def _add_scope_cache(self, scope_cache_key, scope):
+        self.scope_caches[scope_cache_key] = scope
+    def _add_var_cache(self, var_cache_key, var):
+        self.var_caches[var_cache_key] = var
    def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name,
                            fetch_var_name):
        tmp_program = program.clone()
@@ -689,10 +703,12 @@ class Executor(object):
                "Executor requires Program as its Parameter. But you passed in %s"
                % (type(program)))
-        cache_key = _get_strong_program_cache_key(program, feed, fetch_list)
        if use_program_cache:
+            cache_key = _get_strong_program_cache_key(program, feed, fetch_list)
            cached_program = self._get_program_cache(cache_key)
            cached_ctx = self._get_ctx_cache(cache_key)
+            cached_scope = self._get_scope_cache(cache_key)
+            cached_var = self._get_var_cache(cache_key)
            if cached_program is None:
                cached_program = self._add_feed_fetch_ops(
                    program=program,
@@ -701,13 +717,25 @@ class Executor(object):
                    feed_var_name=feed_var_name,
                    fetch_var_name=fetch_var_name)
                self._add_program_cache(cache_key, cached_program)
+                fetch_list_str = list(map(_to_name_str, fetch_list))
                cached_ctx = self._default_executor.prepare_ctx_cache(
-                    cached_program.desc, 0, fetch_list, False)
+                    cached_program.desc, 0, fetch_list_str, False)
+                cached_var = self._default_executor.create_variables(
+                    cached_program.desc, scope, 0)
+                # currently, we cache program, vars, sub_scope here
+                # we suppose that in a life cycle of training, a user
+                # will not create many programs. So, here the basic
+                # rule of caching is to cache all unseen (program, var, scope)
+                # when a user use use_program_cache.
+                cached_scope = scope.new_scope()
                self._add_ctx_cache(cache_key, cached_ctx)
+                self._add_var_cache(cache_key, cached_var)
+                self._add_scope_cache(cache_key, cached_scope)
            program = cached_program
            ctx = cached_ctx
+            scope = cached_scope
+            var = cached_var
        else:
-            self.program_caches.pop(cache_key, None)
            program = self._add_feed_fetch_ops(
                program=program,
                feed=feed,
@@ -719,7 +747,7 @@ class Executor(object):
        if not use_program_cache:
            exe.run(program.desc, scope, 0, True, True, fetch_var_name)
        else:
-            exe.run_cached_prepared_ctx(ctx, scope, True, True, False)
+            exe.run_cached_prepared_ctx(ctx, scope, False, False, False)
        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
        if return_numpy:
            outs = as_numpy(outs)

--- a/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+import numpy
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import mul, data
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        output = mul(x=a, y=b)
+        place = core.CPUPlace()
+        a_np = numpy.random.random((100, 784)).astype('float32')
+        b_np = numpy.random.random((784, 100)).astype('float32')
+        exe = Executor(place)
+        import time
+        use_cache = True
+        step_num = 3
+        run_time = 0.0
+        for i in range(step_num):
+            begin = time.time()
+            outs = exe.run(feed={'a': a_np,
+                                 'b': b_np},
+                           fetch_list=[output.name],
+                           use_program_cache=use_cache)
+            end = time.time()
+            run_time += end - begin
+            out = outs[0]
+            self.assertEqual((100, 100), out.shape)
+            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        print("run time %f" % run_time)
+        use_cache = False
+        run_time = 0.0
+        for i in range(step_num):
+            begin = time.time()
+            outs = exe.run(feed={'a': a_np,
+                                 'b': b_np},
+                           fetch_list=[output.name],
+                           use_program_cache=use_cache)
+            end = time.time()
+            run_time += end - begin
+            out = outs[0]
+            self.assertEqual((100, 100), out.shape)
+            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        print("run time %f" % run_time)
+        use_cache = True
+        run_time = 0.0
+        for i in range(step_num):
+            begin = time.time()
+            outs = exe.run(feed={'a': a_np,
+                                 'b': b_np},
+                           fetch_list=[output.name],
+                           use_program_cache=use_cache)
+            end = time.time()
+            run_time += end - begin
+            out = outs[0]
+            self.assertEqual((100, 100), out.shape)
+            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        print("run time %f" % run_time)
+        use_cache = True
+        run_time = 0.0
+        for i in range(step_num):
+            begin = time.time()
+            outs = exe.run(feed={'a': a_np,
+                                 'b': b_np},
+                           fetch_list=[output],
+                           use_program_cache=use_cache)
+            end = time.time()
+            run_time += end - begin
+            out = outs[0]
+            self.assertEqual((100, 100), out.shape)
+            self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+        print("run time %f" % run_time)
+if __name__ == '__main__':
+    unittest.main()