Commit 4fb2425f authored by A. Unique TensorFlower, committed by TensorFlower Gardener

Add GraphOptimizer to Grappler item builder to do L1 optimizations and

inlining.

Op Counts Comparison (BNMT)
Counts: Profile vs Grappler
Op: Add, 968 vs 965
Op: AddN, 2228 vs 2228
Op: ApplyGradientDescent, 84 vs 84
Op: BatchMatMul, 998 vs 998
Op: Identity, 142 vs 105
Op: MatMul, 63 vs 63
Op: Mul, 10318 vs 10306
Op: OneHot, 1 vs 1
Op: Reshape, 8421 vs 8422
Op: Select, 488 vs 488
Op: Shape, 8132 vs 8131
Op: Sigmoid, 942 vs 942
Op: Softmax, 19 vs 19
Op: StridedSlice, 58 vs 74
Op: Sub, 1398 vs 1394
Op: Tanh, 333 vs 333
Op: Tile, 21 vs 21
Op: Transpose, 39 vs 39
PiperOrigin-RevId: 157288420
Parent 8918fa9e
......@@ -49,7 +49,6 @@ static constexpr const char* const kGradientOp =
static constexpr const char* const kNodeLabel = "Func";
static constexpr const char* const kFuncAttr =
FunctionLibraryDefinition::kFuncAttr;
static constexpr const char* const kNoInlineAttr = "_noinline";
// Represents the index-th output of a node.
struct Endpoint {
......
......@@ -27,6 +27,8 @@ limitations under the License.
namespace tensorflow {
static constexpr const char* const kNoInlineAttr = "_noinline";
// Registers a default customizable kernel creator for a function call.
//
// If 'cb()' returns a non-OK, we still fall back to an executor-based
......
......@@ -108,3 +108,20 @@ cc_test(
"//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
],
)
cc_test(
name = "grappler_item_builder_test",
srcs = ["grappler_item_builder_test.cc"],
deps = [
":grappler_item_builder",
"//tensorflow/cc:cc_ops",
"//tensorflow/cc:functional_ops",
"//tensorflow/cc:grad_testutil",
"//tensorflow/cc:gradients",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
],
)
......@@ -253,6 +253,10 @@ Status GraphProperties::InferDynamically(Cluster* cluster) {
return Status::OK();
}
// Returns true iff input tensor properties have been recorded for the node
// with the given name (i.e. shape inference produced an entry for it).
bool GraphProperties::HasInputProperties(const string& name) const {
  return input_properties_.count(name) > 0;
}
std::vector<OpInfo::TensorProperties> GraphProperties::GetInputProperties(
const string& node_name) const {
auto it = input_properties_.find(node_name);
......
......@@ -37,6 +37,7 @@ class GraphProperties {
Status InferStatically();
Status InferDynamically(Cluster* cluster);
bool HasInputProperties(const string& name) const;
std::vector<OpInfo::TensorProperties> GetInputProperties(
const string& node_name) const;
std::vector<OpInfo::TensorProperties> GetOutputProperties(
......
......@@ -344,6 +344,52 @@ Costs& VirtualScheduler::FindOrCreateZero(const string& op_name,
return it->second;
}
// Pops the current node from the ready queue, updating the bookkeeping of its
// neighbors: outputs gain an executed input (and may become ready), inputs
// gain an executed output (and may become garbage-collectible). Returns true
// while the ready queue is non-empty. Skips over nodes that appear not to
// actually execute when shapes are inferred dynamically.
bool VirtualScheduler::PopCurrNode() {
  const auto* node = ready_nodes_->GetCurrNode();
  auto& node_state = node_map_[node];
  auto& device = device_[DeviceName(node)];
  auto curr_time = device.GetCurrTime();

  // Increment num_inputs_ready of the output nodes; an output becomes ready
  // once all of its inputs have executed.
  for (auto* output : node_state.outputs) {
    auto& output_state = node_map_[output];
    output_state.num_inputs_ready++;
    if (output_state.num_inputs_ready == output_state.inputs.size()) {
      // This output node is now ready.
      output_state.time_ready = curr_time;
      ready_nodes_->AddNode(output);
    }
  }

  // Increment num_outputs_executed of the input nodes.
  for (auto* input : node_state.inputs) {
    auto& input_state = node_map_[input];
    input_state.num_outputs_executed++;
    if (input_state.num_outputs_executed == input_state.outputs.size()) {
      // All the outputs are executed; no more references to this input node.
      input_state.time_no_reference = curr_time;
      // TODO(dyoon): collect device memory usage; note that this input node
      // use device memory between time_scheduled and time_no_reference.
    }
  }

  // Remove the current node; assume FIFO.
  ready_nodes_->RemoveCurrNode();

  // Peek at the new node to see if we should skip it.
  if (!ready_nodes_->Empty()) {
    // NOTE(review): GetCurrNodeInfo() refers to the NEW current node, but
    // IsSendOp(node)/IsRecvOp(node) still test the node that was just popped.
    // If the intent is to inspect the new head of the queue, these should use
    // ready_nodes_->GetCurrNode() instead — confirm.
    if (!use_static_shapes_ &&
        !graph_properties_.HasInputProperties(GetCurrNodeInfo().name) &&
        !IsSendOp(node) && !IsRecvOp(node)) {
      // If inferring shapes dynamically and the node has no input properties,
      // it's likely the node is not actually executed. Skip the node.
      // (Tail-recursive; a long run of skippable nodes deepens the stack.)
      return PopCurrNode();
    }
  }
  return !ready_nodes_->Empty();
}
bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
// Update graph_costs_ and per-op costs.
graph_costs_ = CombineCosts(graph_costs_, node_costs);
......@@ -385,32 +431,7 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
<< ", scheduled: " << node_state.time_scheduled.count()
<< ", finished: " << node_state.time_finished.count();
// Increment num_inputs_ready of the output nodes.
for (auto* output : node_state.outputs) {
auto& output_state = node_map_[output];
output_state.num_inputs_ready++;
if (output_state.num_inputs_ready == output_state.inputs.size()) {
// This output node is now ready.
output_state.time_ready = curr_time;
ready_nodes_->AddNode(output);
}
}
// Increment num_outputs_executed of the input nodes.
for (auto* input : node_state.inputs) {
auto& input_state = node_map_[input];
input_state.num_outputs_executed++;
if (input_state.num_outputs_executed == input_state.outputs.size()) {
// All the outputs are executed; no reference to this input nodel
input_state.time_no_reference = curr_time;
// TODO(dyoon): collect device memory usage; note that this input node
// use device memory between time_scheduled and time_no_reference.
}
}
// Remove the current node; assume FIFO.
ready_nodes_->RemoveCurrNode();
return !ready_nodes_->Empty(); // True if not empty.
return PopCurrNode();
}
Costs VirtualScheduler::Summary() const {
......
......@@ -131,6 +131,8 @@ class VirtualScheduler {
string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
Costs& FindOrCreateZero(const string& op_name,
std::map<string, Costs>* op_cost);
bool PopCurrNode();
bool IsSendOp(const NodeDef* node) const;
bool IsRecvOp(const NodeDef* node) const;
......
......@@ -19,7 +19,11 @@ limitations under the License.
#include <unordered_set>
#include <vector>
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/device_mgr.h"
#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/common_runtime/graph_optimizer.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/function.pb.h"
......@@ -33,11 +37,13 @@ limitations under the License.
#include "tensorflow/core/grappler/op_types.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"
#include "tensorflow/core/public/session_options.h"
namespace tensorflow {
namespace grappler {
namespace {
void InitializeTensor(DataType type, Tensor* tensor) {
const int period = 7;
if (type == DT_FLOAT) {
......@@ -58,96 +64,56 @@ void InitializeTensor(DataType type, Tensor* tensor) {
}
}
// Helper function that returns a bool indicating if there are function
// call nodes in graph.
bool HasFunctionInGraph(const Graph& graph) {
for (const Node* n : graph.nodes()) {
if (graph.flib_def().Find(n->type_string()) != nullptr) {
return true;
}
}
return false;
}
// Optimize the graph def (including function inlining and other optimizations).
// This is a temporary change that optimizes the graph in context of a single
// gpu machine. Down the line, we may want to make grappler_item_builder aware
// of the cluster type (E.g: single cpu, multiple gpu, etc) being simulated in
// order to get the correct session options and environment, and performing the
// correct optimizations.
Status OptimizeGraph(const GraphDef& graph_def, GraphDef* output_graph_def) {
// Create a session option for a single GPU device.
SessionOptions options;
// Wrapper around FunctionDefToBodyHelper: builds a FunctionBody for
// `function_def`, instantiated with the attributes carried by `node_def`,
// resolving op signatures against `function_library`. On success, the caller
// owns *function_body.
Status CreateFunctionBody(const FunctionLibraryDefinition& function_library,
                          const FunctionDef& function_def,
                          const NodeDef& node_def,
                          FunctionBody** function_body) {
  // Signature lookup delegates straight to the function library.
  std::function<Status(const string&, const OpDef**)> get_function_signature =
      [&function_library](const string& op_name, const OpDef** signature) {
        return function_library.LookUpOpDef(op_name, signature);
      };
  TF_RETURN_IF_ERROR(
      FunctionDefToBodyHelper(function_def, AttrSlice(node_def),
                              &function_library, get_function_signature,
                              function_body));
  return Status::OK();
}
// Inline all functions.
GraphDef inlined_graph_def(graph_def);
for (int i = 0; i < inlined_graph_def.library().function().size(); i++) {
FunctionDef* fdef =
inlined_graph_def.mutable_library()->mutable_function(i);
SetAttrValue(false, &((*fdef->mutable_attr())[kNoInlineAttr]));
}
// Inlines all functions in a Graph. Does not recursively inline, so if graph
// contains Function A that calls Function B, calling InlineFunctions once will
// produce a graph with A inlined but not B. Calling InlineFunctions a second
// time will produce a graph with both A and B inlined.
Status InlineFunctions(Graph* graph) {
const FunctionLibraryDefinition& function_library = graph->flib_def();
std::vector<std::pair<Node*, FunctionBody*>> nodes_and_funcs_to_inline;
std::unordered_map<string, std::unique_ptr<FunctionBody>>
function_name_to_body;
std::function<Status(const string&, const OpDef**)> get_function_signature =
[&function_library](const string& name, const OpDef** signature) {
return function_library.LookUpOpDef(name, signature);
};
// Instantiate all variables for function library runtime creation.
std::vector<Device*> devices;
TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
options, "/job:localhost/replica:0/task:0", &devices));
std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(devices));
FunctionLibraryDefinition function_library(OpRegistry::Global(),
inlined_graph_def.library());
Env* env = Env::Default();
for (Node* node : graph->nodes()) {
const FunctionDef* function_def =
function_library.Find(node->type_string());
if (!function_def) {
// Not a function node.
continue;
}
FunctionBody* function_body = nullptr;
const string key = Canonicalize(node->def().op(), AttrSlice(node->def()));
if (function_name_to_body.find(key) == function_name_to_body.end()) {
TF_RETURN_IF_ERROR(CreateFunctionBody(function_library, *function_def,
node->def(), &function_body));
function_name_to_body.emplace(
key, std::unique_ptr<FunctionBody>(function_body));
}
function_body = function_name_to_body[key].get();
if (function_body) {
nodes_and_funcs_to_inline.emplace_back(node, function_body);
}
}
// Optimizer options: L1 and inlining. L1 is default.
OptimizerOptions* optimizer_opts =
options.config.mutable_graph_options()->mutable_optimizer_options();
optimizer_opts->set_do_function_inlining(true);
for (const auto& iter : nodes_and_funcs_to_inline) {
InlineFunctionBody(function_library, graph, iter.first, iter.second);
}
return Status::OK();
}
// Create the function library runtime.
std::unique_ptr<FunctionLibraryRuntime> flib(NewFunctionLibraryRuntime(
dvc_mgr.get(), env, devices[0], inlined_graph_def.versions().producer(),
&function_library, *optimizer_opts));
// Sets *inlined_graph to be graph with all function NodeDefs in graph inlined.
// Recursively inlines, so if graph contains Function A that calls Function B,
// calling InlineAllFunctions once will produce a graph with both A and B
// inlined.
Status InlineAllFunctions(const GraphDef& graph_def,
GraphDef* inlined_graph_def) {
*inlined_graph_def = GraphDef::default_instance();
// Create a Graph from graph_def. Inlining needs to happen
// on a single Graph object in order to guarantee unique
// names of nodes created during the inlining process.
// Create the GraphOptimizer to optimize the graph def.
GraphConstructorOptions graph_ctor_opts;
graph_ctor_opts.allow_internal_ops = true;
graph_ctor_opts.expect_device_spec = false;
FunctionLibraryDefinition function_library(OpRegistry::Global(),
graph_def.library());
Graph inlined_graph(function_library);
TF_RETURN_IF_ERROR(
ConvertGraphDefToGraph(graph_ctor_opts, graph_def, &inlined_graph));
while (HasFunctionInGraph(inlined_graph)) {
TF_RETURN_IF_ERROR(InlineFunctions(&inlined_graph));
}
inlined_graph.ToGraphDef(inlined_graph_def);
std::unique_ptr<Graph> graphptr(new Graph(function_library));
TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_ctor_opts, inlined_graph_def,
graphptr.get()));
// Optimize the graph.
GraphOptimizer optimizer(*optimizer_opts);
optimizer.Optimize(flib.get(), env, devices[0], &graphptr);
graphptr->ToGraphDef(output_graph_def);
return Status::OK();
}
} // namespace
......@@ -163,11 +129,12 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
new_item->id = id;
new_item->graph = meta_graph.graph_def();
if (cfg.inline_functions) {
Status s = InlineAllFunctions(meta_graph.graph_def(), &new_item->graph);
if (!s.ok()) {
LOG(ERROR) << "Unable to inline functions: " << s.error_message()
<< ", skipping this input.";
// Optimize the graph (function inlining, l1 optimizations, etc).
if (cfg.apply_optimizations) {
Status optimize_status =
OptimizeGraph(meta_graph.graph_def(), &new_item->graph);
if (!optimize_status.ok()) {
LOG(ERROR) << "Function optimization failed: " << optimize_status;
return nullptr;
}
}
......
......@@ -31,7 +31,7 @@ struct ItemConfig {
: ignore_user_placement(true),
ignore_colocation(true),
placeholder_unknown_output_shape_dim(-1),
inline_functions(true) {}
apply_optimizations(true) {}
// If true, ignore all user specified node placement.
bool ignore_user_placement;
......@@ -40,8 +40,8 @@ struct ItemConfig {
// Dimension to use if a placeholder node has an _output_shapes attribute with
// a dimension of -1.
int placeholder_unknown_output_shape_dim;
// If true, inline all functions in the graph.
bool inline_functions;
// If true, does inlining and L1 optimizations.
bool apply_optimizations;
};
// Factory method for creating a GrapplerItem from a MetaGraphDef.
......
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/grappler/grappler_item_builder.h"
#include "tensorflow/cc/framework/gradients.h"
#include "tensorflow/cc/gradients/grad_testutil.h"
#include "tensorflow/cc/ops/functional_ops.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/node_def_util.h"
#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/meta_graph.pb.h"
namespace tensorflow {
namespace grappler {
namespace {
class GrapplerItemBuilderTest : public ::testing::Test {};
// Builds a sample graph containing a SymbolicGradient node for the Sum op.
// Outputs the GraphDef, the fetch collection (the gradient's first output),
// and the list of node names that inlining the gradient is expected to
// produce (used by the test for validation).
void SampleSumSymbolicGradientGraphdef(
    GraphDef *def, CollectionDef *fetches,
    std::vector<string> *names_of_ops_of_inline) {
  using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();

  // NOTE(review): dummy_variable is never consumed below — presumably kept so
  // the graph contains a Variable op; confirm it is still needed.
  auto dummy_variable = Variable(scope, {2, 2}, DT_FLOAT);
  auto x = Const(scope, 1.0f);
  auto y = Const(scope, 2);  // int const — likely the Sum reduction indices.
  auto z = Const(scope, 3.0f);
  TF_ASSERT_OK(scope.status());

  NameAttrList fn;
  fn.set_name("Sum");
  (*fn.mutable_attr())["T"].set_type(DT_FLOAT);
  auto g0 = SymbolicGradient(scope, std::initializer_list<Input>{x, y, z},
                             {DT_FLOAT, DT_INT32}, fn);
  // Fetch the first gradient output so the gradient node is kept.
  fetches->mutable_node_list()->add_value(g0[0].name());
  TF_CHECK_OK(scope.ToGraphDef(def));

  // Names of the ops that replace the Sum symbolic gradient during
  // inlining. This is for validation.
  *names_of_ops_of_inline = {
      "SymbolicGradient/dx", "SymbolicGradient/tile_scaling",
      "SymbolicGradient/dy_reshaped", "SymbolicGradient/y_shape",
      "SymbolicGradient/x_shape", "SymbolicGradient/stitch_idx0",
      "SymbolicGradient/x_rank", "SymbolicGradient/stitch_val1",
      "SymbolicGradient/i_shape", "SymbolicGradient/di",
      "SymbolicGradient/zero", "SymbolicGradient/one"};
}
// Wraps `def` and the fetch collection (stored under "train_op") in a
// MetaGraphDef and builds a GrapplerItem from it with the default ItemConfig.
std::unique_ptr<GrapplerItem> CreateGrapplerItem(const GraphDef &def,
                                                 const CollectionDef &fetches) {
  MetaGraphDef meta_def;
  *meta_def.mutable_graph_def() = def;
  (*meta_def.mutable_collection_def())["train_op"] = fetches;
  ItemConfig cfg;
  return GrapplerItemFromMetaGraphDef("0", meta_def, cfg);
}
// Counts how many nodes in the item's graph are SymbolicGradient ops.
int CountSymbolicGradientOps(const std::unique_ptr<GrapplerItem> &item) {
  int count = 0;
  for (const auto &node : item->graph.node()) {
    count += (node.op() == FunctionLibraryDefinition::kGradientOp) ? 1 : 0;
  }
  return count;
}
int CountOpsWithNames(const std::unique_ptr<GrapplerItem> &item,
const std::vector<string> &names) {
std::set<string> names_set(names.begin(), names.end());
int n_with_names = 0;
for (const auto &node : item->graph.node()) {
if (names_set.find(node.name()) != names_set.end()) {
n_with_names++;
}
}
return n_with_names;
}
TEST_F(GrapplerItemBuilderTest, SymbolicGradientInlining) {
  // Build a sample graph containing a Sum symbolic gradient node.
  GraphDef def;
  CollectionDef fetches;
  std::vector<string> ops_of_inline;
  SampleSumSymbolicGradientGraphdef(&def, &fetches, &ops_of_inline);

  // Build the GrapplerItem, which inlines functions during construction.
  std::unique_ptr<GrapplerItem> with_inline = CreateGrapplerItem(def, fetches);
  // The builder returns nullptr on failure; bail out instead of dereferencing.
  ASSERT_NE(nullptr, with_inline);

  // After inlining, no SymbolicGradient ops should remain.
  // (EXPECT_EQ reports a test failure; CHECK_EQ would abort the whole binary.)
  EXPECT_EQ(0, CountSymbolicGradientOps(with_inline));

  // All of the ops produced by expanding the gradient must be in the graph.
  EXPECT_EQ(ops_of_inline.size(), CountOpsWithNames(with_inline, ops_of_inline));
}
} // namespace
} // namespace grappler
} // namespace tensorflow
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register