Commit 4fb2425f authored by A. Unique TensorFlower, committed by TensorFlower Gardener

Add GraphOptimizer to Grappler item builder to do L1 optimizations and

inlining.

Op Counts Comparison (BNMT)
Counts: Profile vs Grappler
Op: Add, 968 vs 965
Op: AddN, 2228 vs 2228
Op: ApplyGradientDescent, 84 vs 84
Op: BatchMatMul, 998 vs 998
Op: Identity, 142 vs 105
Op: MatMul, 63 vs 63
Op: Mul, 10318 vs 10306
Op: OneHot, 1 vs 1
Op: Reshape, 8421 vs 8422
Op: Select, 488 vs 488
Op: Shape, 8132 vs 8131
Op: Sigmoid, 942 vs 942
Op: Softmax, 19 vs 19
Op: StridedSlice, 58 vs 74
Op: Sub, 1398 vs 1394
Op: Tanh, 333 vs 333
Op: Tile, 21 vs 21
Op: Transpose, 39 vs 39
PiperOrigin-RevId: 157288420
Parent 8918fa9e
......@@ -49,7 +49,6 @@ static constexpr const char* const kGradientOp =
static constexpr const char* const kNodeLabel = "Func";
static constexpr const char* const kFuncAttr =
FunctionLibraryDefinition::kFuncAttr;
static constexpr const char* const kNoInlineAttr = "_noinline";
// Represents the index-th output of a node.
struct Endpoint {
......
......@@ -27,6 +27,8 @@ limitations under the License.
namespace tensorflow {
static constexpr const char* const kNoInlineAttr = "_noinline";
// Registers a default customizable kernel creator for a function call.
//
// If 'cb()' returns a non-OK, we still fall back to an executor-based
......
......@@ -108,3 +108,20 @@ cc_test(
"//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
],
)
cc_test(
name = "grappler_item_builder_test",
srcs = ["grappler_item_builder_test.cc"],
deps = [
":grappler_item_builder",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:functional_ops",
"//tensorflow/cc:grad_testutil",
"//tensorflow/cc:gradients",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
],
)
......@@ -253,6 +253,10 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
return Status::OK();
}
// Returns true iff input tensor properties have been recorded for the node
// with the given name (i.e. shape inference produced an entry for it).
bool GraphProperties::HasInputProperties(const string& name) const {
  return input_properties_.count(name) > 0;
}
std::vector<OpInfo::TensorProperties> GraphProperties::GetInputProperties(
const string& node_name) const {
auto it = input_properties_.find(node_name);
......
......@@ -37,6 +37,7 @@ class GraphProperties {
Status InferStatically();
Status InferDynamically(Cluster* cluster);
bool HasInputProperties(const string& name) const;
std::vector<OpInfo::TensorProperties> GetInputProperties(
const string& node_name) const;
std::vector<OpInfo::TensorProperties> GetOutputProperties(
......
......@@ -344,6 +344,52 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
return it->second;
}
// Pops the current node from the ready queue, updating the bookkeeping of its
// neighbors: outputs gain an executed input (and may become ready), inputs
// gain an executed output (and may become garbage-collectible). Returns true
// while the ready queue is non-empty. Skips over nodes that appear not to
// actually execute when shapes are inferred dynamically.
bool VirtualScheduler::PopCurrNode() {
  const auto* node = ready_nodes_->GetCurrNode();
  auto& node_state = node_map_[node];
  auto& device = device_[DeviceName(node)];
  auto curr_time = device.GetCurrTime();

  // Increment num_inputs_ready of the output nodes; an output becomes ready
  // once all of its inputs have executed.
  for (auto* output : node_state.outputs) {
    auto& output_state = node_map_[output];
    output_state.num_inputs_ready++;
    if (output_state.num_inputs_ready == output_state.inputs.size()) {
      // This output node is now ready.
      output_state.time_ready = curr_time;
      ready_nodes_->AddNode(output);
    }
  }

  // Increment num_outputs_executed of the input nodes.
  for (auto* input : node_state.inputs) {
    auto& input_state = node_map_[input];
    input_state.num_outputs_executed++;
    if (input_state.num_outputs_executed == input_state.outputs.size()) {
      // All the outputs are executed; no more references to this input node.
      input_state.time_no_reference = curr_time;
      // TODO(dyoon): collect device memory usage; note that this input node
      // use device memory between time_scheduled and time_no_reference.
    }
  }

  // Remove the current node; assume FIFO.
  ready_nodes_->RemoveCurrNode();

  // Peek at the new node to see if we should skip it.
  if (!ready_nodes_->Empty()) {
    // NOTE(review): GetCurrNodeInfo() refers to the NEW current node, but
    // IsSendOp(node)/IsRecvOp(node) still test the node that was just popped.
    // If the intent is to inspect the new head of the queue, these should use
    // ready_nodes_->GetCurrNode() instead — confirm.
    if (!use_static_shapes_ &&
        !graph_properties_.HasInputProperties(GetCurrNodeInfo().name) &&
        !IsSendOp(node) && !IsRecvOp(node)) {
      // If inferring shapes dynamically and the node has no input properties,
      // it's likely the node is not actually executed. Skip the node.
      // (Tail-recursive; a long run of skippable nodes deepens the stack.)
      return PopCurrNode();
    }
  }
  return !ready_nodes_->Empty();
}
bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
// Update graph_costs_ and per-op costs.
graph_costs_ = CombineCosts(graph_costs_, node_costs);
......@@ -385,32 +431,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
<< ", scheduled: " << node_state.time_scheduled.count()
<< ", finished: " << node_state.time_finished.count();
// Increment num_inputs_ready of the output nodes.
for (auto* output : node_state.outputs) {
auto& output_state = node_map_[output];
output_state.num_inputs_ready++;
if (output_state.num_inputs_ready == output_state.inputs.size()) {
// This output node is now ready.
output_state.time_ready = curr_time;
ready_nodes_->AddNode(output);
}
}
// Increment num_outputs_executed of the input nodes.
for (auto* input : node_state.inputs) {
auto& input_state = node_map_[input];
input_state.num_outputs_executed++;
if (input_state.num_outputs_executed == input_state.outputs.size()) {
// All the outputs are executed; no reference to this input nodel
input_state.time_no_reference = curr_time;
// TODO(dyoon): collect device memory usage; note that this input node
// use device memory between time_scheduled and time_no_reference.
}
}
// Remove the current node; assume FIFO.
ready_nodes_->RemoveCurrNode();
return !ready_nodes_->Empty(); // True if not empty.
return PopCurrNode();
}
Costs VirtualScheduler::Summary() const {
......
......@@ -131,6 +131,8 @@ class VirtualScheduler {
string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
Costs& FindOrCreateZero(const string& op_name,
std::map<string, Costs>* op_cost);
bool PopCurrNode();
bool IsSendOp(const NodeDef* node) const;
bool IsRecvOp(const NodeDef* node) const;
......
......@@ -19,7 +19,11 @@ limitations under the License.
#include <unordered_set>
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/graph_optimizer.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/function.pb.h"
......@@ -33,11 +37,13 @@ limitations under the License.
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace grappler {
namespace {
void InitializeTensor(DataType type, Tensor* tensor) {
const int period = 7;
if (type == DT_FLOAT) {
......@@ -58,96 +64,56 @@ void InitializeTensor(DataType type, Tensor* tensor) {
}
}
// Helper function that returns a bool indicating if there are function
// call nodes in graph.
bool HasFunctionInGraph(const Graph& graph) {
for (const Node* n : graph.nodes()) {
if (graph.flib_def().Find(n->type_string()) != nullptr) {
return true;
}
}
return false;
}
// Optimize the graph def (including function inlining and other optimizations).
// This is a temporary change that optimizes the graph in context of a single
// gpu machine. Down the line, we may want to make grappler_item_builder aware
// of the cluster type (E.g: single cpu, multiple gpu, etc) being simulated in
// order to get the correct session options and environment, and performing the
// correct optimizations.
Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def) {
// Create a session option for a single GPU device.
SessionOptions options;
// Wrapper around FunctionDefToBodyHelper: builds a FunctionBody for
// `function_def`, instantiated with the attributes carried by `node_def`,
// resolving op signatures against `function_library`. On success, the caller
// owns *function_body.
Status CreateFunctionBody(const FunctionLibraryDefinition& function_library,
                          const FunctionDef& function_def,
                          const NodeDef& node_def,
                          FunctionBody** function_body) {
  // Signature lookup delegates straight to the function library.
  std::function<Status(const string&, const OpDef**)> get_function_signature =
      [&function_library](const string& op_name, const OpDef** signature) {
        return function_library.LookUpOpDef(op_name, signature);
      };
  TF_RETURN_IF_ERROR(
      FunctionDefToBodyHelper(function_def, AttrSlice(node_def),
                              &function_library, get_function_signature,
                              function_body));
  return Status::OK();
}
// Inline all functions.
GraphDef inlined_graph_def(graph_def);
for (int i = 0; i < inlined_graph_def.library().function().size(); i++) {
FunctionDef* fdef =
inlined_graph_def.mutable_library()->mutable_function(i);
SetAttrValue(false, &((*fdef->mutable_attr())[kNoInlineAttr]));
}
// Inlines all functions in a Graph. Does not recursively inline, so if graph
// contains Function A that calls Function B, calling InlineFunctions once will
// produce a graph with A inlined but not B. Calling InlineFunctions a second
// time will produce a graph with both A and B inlined.
Status InlineFunctions(Graph* graph) {
const FunctionLibraryDefinition& function_library = graph->flib_def();
std::vector<std::pair<Node*, FunctionBody*>> nodes_and_funcs_to_inline;
std::unordered_map<string, std::unique_ptr<FunctionBody>>
function_name_to_body;
std::function<Status(const string&, const OpDef**)> get_function_signature =
[&function_library](const string& name, const OpDef** signature) {
return function_library.LookUpOpDef(name, signature);
};
// Instantiate all variables for function library runtime creation.
std::vector<Device*> devices;
TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
options, "/job:localhost/replica:0/task:0", &devices));
std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(devices));
FunctionLibraryDefinition function_library(OpRegistry::Global(),
inlined_graph_def.library());
Env* env = Env::Default();
for (Node* node : graph->nodes()) {
const FunctionDef* function_def =
function_library.Find(node->type_string());
if (!function_def) {
// Not a function node.
continue;
}
FunctionBody* function_body = nullptr;
const string key = Canonicalize(node->def().op(), AttrSlice(node->def()));
if (function_name_to_body.find(key) == function_name_to_body.end()) {
TF_RETURN_IF_ERROR(CreateFunctionBody(function_library, *function_def,
node->def(), &function_body));
function_name_to_body.emplace(
key, std::unique_ptr<FunctionBody>(function_body));
}
function_body = function_name_to_body[key].get();
if (function_body) {
nodes_and_funcs_to_inline.emplace_back(node, function_body);
}
}
// Optimizer options: L1 and inlining. L1 is default.
OptimizerOptions* optimizer_opts =
options.config.mutable_graph_options()->mutable_optimizer_options();
optimizer_opts->set_do_function_inlining(true);
for (const auto& iter : nodes_and_funcs_to_inline) {
InlineFunctionBody(function_library, graph, iter.first, iter.second);
}
return Status::OK();
}
// Create the function library runtime.
std::unique_ptr<FunctionLibraryRuntime> flib(NewFunctionLibraryRuntime(
dvc_mgr.get(), env, devices[0], inlined_graph_def.versions().producer(),
&function_library, *optimizer_opts));
// Sets *inlined_graph to be graph with all function NodeDefs in graph inlined.
// Recursively inlines, so if graph contains Function A that calls Function B,
// calling InlineAllFunctions once will produce a graph with both A and B
// inlined.
Status InlineAllFunctions(const GraphDef& graph_def,
GraphDef* inlined_graph_def) {
*inlined_graph_def = GraphDef::default_instance();
// Create a Graph from graph_def. Inlining needs to happen
// on a single Graph object in order to guarantee unique
// names of nodes created during the inlining process.
// Create the GraphOptimizer to optimize the graph def.
GraphConstructorOptions graph_ctor_opts;
graph_ctor_opts.allow_internal_ops = true;
graph_ctor_opts.expect_device_spec = false;
FunctionLibraryDefinition function_library(OpRegistry::Global(),
graph_def.library());
Graph inlined_graph(function_library);
TF_RETURN_IF_ERROR(
ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &inlined_graph));
while (HasFunctionInGraph(inlined_graph)) {
TF_RETURN_IF_ERROR(InlineFunctions(&inlined_graph));
}
inlined_graph.ToGraphDef(inlined_graph_def);
std::unique_ptr<Graph> graphptr(new Graph(function_library));
TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_ctor_opts, inlined_graph_def,
graphptr.get()));
// Optimize the graph.
GraphOptimizer optimizer(*optimizer_opts);
optimizer.Optimize(flib.get(), env, devices[0], &graphptr);
graphptr->ToGraphDef(output_graph_def);
return Status::OK();
}
} // namespace
......@@ -163,11 +129,12 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
new_item->id = id;
new_item->graph = meta_graph.graph_def();
if (cfg.inline_functions) {
Status s = InlineAllFunctions(meta_graph.graph_def(), &new_item->graph);
if (!s.ok()) {
LOG(ERROR) << "Unable to inline functions: " << s.error_message()
<< ", skipping this input.";
// Optimize the graph (function inlining, l1 optimizations, etc).
if (cfg.apply_optimizations) {
Status optimize_status =
OptimizeGraph(meta_graph.graph_def(), &new_item->graph);
if (!optimize_status.ok()) {
LOG(ERROR) << "Function optimization failed: " << optimize_status;
return nullptr;
}
}
......
......@@ -31,7 +31,7 @@ struct ItemConfig {
: ignore_user_placement(true),
ignore_colocation(true),
placeholder_unknown_output_shape_dim(-1),
inline_functions(true) {}
apply_optimizations(true) {}
// If true, ignore all user specified node placement.
bool ignore_user_placement;
......@@ -40,8 +40,8 @@ struct ItemConfig {
// Dimension to use if a placeholder node has an _output_shapes attribute with
// a dimension of -1.
int placeholder_unknown_output_shape_dim;
// If true, inline all functions in the graph.
bool inline_functions;
// If true, does inlining and L1 optimizations.
bool apply_optimizations;
};
// Factory method for creating a GrapplerItem from a MetaGraphDef.
......
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/grappler/grappler_item_builder.h"
#include "tensorflow/cc/framework/gradients.h"
#include "tensorflow/cc/gradients/grad_testutil.h"
#include "tensorflow/cc/ops/functional_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"
namespace tensorflow {
namespace grappler {
namespace {
class GrapplerItemBuilderTest : public ::testing::Test {};
// Builds a sample graph containing a SymbolicGradient node for the Sum op.
// Outputs the GraphDef, the fetch collection (the gradient's first output),
// and the list of node names that inlining the gradient is expected to
// produce (used by the test for validation).
void SampleSumSymbolicGradientGraphdef(
    GraphDef *def, CollectionDef *fetches,
    std::vector<string> *names_of_ops_of_inline) {
  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();

  // NOTE(review): dummy_variable is never consumed below — presumably kept so
  // the graph contains a Variable op; confirm it is still needed.
  auto dummy_variable = Variable(scope, {2, 2}, DT_FLOAT);
  auto x = Const(scope, 1.0f);
  auto y = Const(scope, 2);  // int const — likely the Sum reduction indices.
  auto z = Const(scope, 3.0f);
  TF_ASSERT_OK(scope.status());

  NameAttrList fn;
  fn.set_name("Sum");
  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
  auto g0 = SymbolicGradient(scope, std::initializer_list<Input>{x, y, z},
                             {DT_FLOAT, DT_INT32}, fn);
  // Fetch the first gradient output so the gradient node is kept.
  fetches->mutable_node_list()->add_value(g0[0].name());
  TF_CHECK_OK(scope.ToGraphDef(def));

  // Names of the ops that replace the Sum symbolic gradient during
  // inlining. This is for validation.
  *names_of_ops_of_inline = {
      "SymbolicGradient/dx", "SymbolicGradient/tile_scaling",
      "SymbolicGradient/dy_reshaped", "SymbolicGradient/y_shape",
      "SymbolicGradient/x_shape", "SymbolicGradient/stitch_idx0",
      "SymbolicGradient/x_rank", "SymbolicGradient/stitch_val1",
      "SymbolicGradient/i_shape", "SymbolicGradient/di",
      "SymbolicGradient/zero", "SymbolicGradient/one"};
}
// Wraps `def` and the fetch collection (stored under "train_op") in a
// MetaGraphDef and builds a GrapplerItem from it with the default ItemConfig.
std::unique_ptr<GrapplerItem> CreateGrapplerItem(const GraphDef &def,
                                                 const CollectionDef &fetches) {
  MetaGraphDef meta_def;
  *meta_def.mutable_graph_def() = def;
  (*meta_def.mutable_collection_def())["train_op"] = fetches;
  ItemConfig cfg;
  return GrapplerItemFromMetaGraphDef("0", meta_def, cfg);
}
// Counts how many nodes in the item's graph are SymbolicGradient ops.
int CountSymbolicGradientOps(const std::unique_ptr<GrapplerItem> &item) {
  int count = 0;
  for (const auto &node : item->graph.node()) {
    count += (node.op() == FunctionLibraryDefinition::kGradientOp) ? 1 : 0;
  }
  return count;
}
int CountOpsWithNames(const std::unique_ptr<GrapplerItem> &item,
const std::vector<string> &names) {
std::set<string> names_set(names.begin(), names.end());
int n_with_names = 0;
for (const auto &node : item->graph.node()) {
if (names_set.find(node.name()) != names_set.end()) {
n_with_names++;
}
}
return n_with_names;
}
TEST_F(GrapplerItemBuilderTest, SymbolicGradientInlining) {
  // Build a sample graph containing a Sum symbolic gradient node.
  GraphDef def;
  CollectionDef fetches;
  std::vector<string> ops_of_inline;
  SampleSumSymbolicGradientGraphdef(&def, &fetches, &ops_of_inline);

  // Build the GrapplerItem, which inlines functions during construction.
  std::unique_ptr<GrapplerItem> with_inline = CreateGrapplerItem(def, fetches);
  // The builder returns nullptr on failure; bail out instead of dereferencing.
  ASSERT_NE(nullptr, with_inline);

  // After inlining, no SymbolicGradient ops should remain.
  // (EXPECT_EQ reports a test failure; CHECK_EQ would abort the whole binary.)
  EXPECT_EQ(0, CountSymbolicGradientOps(with_inline));

  // All of the ops produced by expanding the gradient must be in the graph.
  EXPECT_EQ(ops_of_inline.size(), CountOpsWithNames(with_inline, ops_of_inline));
}
} // namespace
} // namespace grappler
} // namespace tensorflow
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register