Commit d9cfc64a authored by Vijay Vasudevan

TensorFlow: merge changes from internal

Change 109945903
	Make unsorted_segment_sum detect negative indices

	Previously it crashed.  This fixes #466.

	Also improve the error message to say which index is problematic.
Change 109942557
	Fix conv_grad_input with stride 2.
	+ We now always call the cuDNN implementation, even when the padding is
	  incompatible.

Base CL: 109948577
Parent 475edf8e
@@ -337,6 +337,7 @@ Status DirectSession::GetOrCreateExecutors(
for (const auto& graph : graphs) {
const string& partition_name = graph.first;
Graph* partition_graph = graph.second;
const int graph_def_version = partition_graph->version();
Device* d;
s = device_mgr_->LookupDevice(partition_name, &d);
@@ -347,8 +348,10 @@ Status DirectSession::GetOrCreateExecutors(
LocalExecutorParams params;
params.has_control_flow = has_control_flow;
params.device = d;
params.create_kernel = [this, d](const NodeDef& ndef, OpKernel** kernel) {
return CreateCachedKernel(d, session_handle_, nullptr, ndef, kernel);
params.create_kernel = [this, d, graph_def_version](const NodeDef& ndef,
OpKernel** kernel) {
return CreateCachedKernel(d, session_handle_, nullptr, ndef,
graph_def_version, kernel);
};
params.delete_kernel = [this, d](OpKernel* kernel) {
DeleteCachedKernel(d, session_handle_, kernel);
......
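The create_kernel closure above captures graph_def_version by value so that kernels created later still see the version of the graph they were compiled from. A minimal standalone sketch of that capture pattern (all names here are illustrative mocks, not TensorFlow's types):

#include <cstdio>
#include <functional>

// Hypothetical stand-ins for illustration only.
struct NodeDef {};
using CreateKernelFn = std::function<void(const NodeDef&)>;

CreateKernelFn MakeCreateKernelFn(int graph_def_version) {
  // Capture by value: the factory outlives this scope, so every kernel it
  // creates still sees the version recorded when the executor was built.
  return [graph_def_version](const NodeDef& /*ndef*/) {
    std::printf("creating kernel for GraphDef version %d\n",
                graph_def_version);
  };
}

int main() {
  CreateKernelFn create_kernel = MakeCreateKernelFn(9);
  create_kernel(NodeDef{});  // prints: creating kernel for GraphDef version 9
}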
@@ -2140,20 +2140,22 @@ Status NewLocalExecutor(const LocalExecutorParams& params, const Graph* graph,
}
Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
const NodeDef& ndef, OpKernel** kernel) {
const NodeDef& ndef, int graph_def_version,
OpKernel** kernel) {
auto device_type = DeviceType(device->attributes().device_type());
auto allocator = device->GetAllocator(AllocatorAttributes());
return CreateOpKernel(device_type, device, allocator, flib, ndef, kernel);
return CreateOpKernel(device_type, device, allocator, flib, ndef,
graph_def_version, kernel);
}
void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
Status CreateCachedKernel(Device* device, const string& session,
FunctionLibraryRuntime* flib, const NodeDef& ndef,
OpKernel** kernel) {
int graph_def_version, OpKernel** kernel) {
auto op_seg = device->op_segment();
auto create_fn = [device, flib, &ndef](OpKernel** kernel) {
return CreateNonCachedKernel(device, flib, ndef, kernel);
auto create_fn = [device, flib, &ndef, graph_def_version](OpKernel** kernel) {
return CreateNonCachedKernel(device, flib, ndef, graph_def_version, kernel);
};
return op_seg->FindOrCreate(session, ndef.name(), kernel, create_fn);
}
......
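CreateCachedKernel above defers to the device's OpSegment, which the header below describes as returning the same kernel instance for the same (session, ndef.name()) pair. A simplified stand-in for that find-or-create behavior (assumed semantics sketched with mock types; not OpSegment's actual code):

#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <utility>

struct OpKernel {};  // mock

// Memoize kernels per (session, node name); create_fn runs only on a miss.
OpKernel* FindOrCreate(const std::string& session, const std::string& node,
                       const std::function<OpKernel*()>& create_fn) {
  static std::map<std::pair<std::string, std::string>, OpKernel*> cache;
  auto key = std::make_pair(session, node);
  auto it = cache.find(key);
  if (it != cache.end()) return it->second;  // hit: reuse the kernel
  OpKernel* kernel = create_fn();            // miss: build and remember it
  cache[key] = kernel;
  return kernel;
}

int main() {
  auto create = [] { std::puts("created"); return new OpKernel; };
  FindOrCreate("session1", "MatMul_1", create);  // prints "created"
  FindOrCreate("session1", "MatMul_1", create);  // silent: served from cache
}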
@@ -202,7 +202,8 @@ class ExecutorBarrier {
// access the functions in the "flib". The caller takes ownership of
// returned "*kernel".
Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
const NodeDef& ndef, OpKernel** kernel);
const NodeDef& ndef, int graph_def_version,
OpKernel** kernel);
// Deletes "kernel" returned by CreateKernel.
void DeleteNonCachedKernel(OpKernel* kernel);
@@ -213,7 +214,7 @@ void DeleteNonCachedKernel(OpKernel* kernel);
// ndef.name(), returns the same kernel instance.
Status CreateCachedKernel(Device* device, const string& session,
FunctionLibraryRuntime* flib, const NodeDef& ndef,
OpKernel** kernel);
int graph_def_version, OpKernel** kernel);
// Deletes "kernel" returned by CreateCachedKernel.
void DeleteCachedKernel(Device* device, const string& session,
......
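The comments above pin down ownership: non-cached kernels belong to the caller and must be released with the matching delete function. A hedged sketch of a call pattern that keeps that contract explicit (mock types, not TensorFlow's literal code):

#include <memory>

struct OpKernel { virtual ~OpKernel() = default; };  // mock
void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }

// Pair the raw pointer with its designated deleter so ownership cannot leak.
using KernelPtr = std::unique_ptr<OpKernel, void (*)(OpKernel*)>;

int main() {
  KernelPtr kernel(new OpKernel, &DeleteNonCachedKernel);
}  // DeleteNonCachedKernel runs automatically when kernel goes out of scope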
@@ -241,6 +241,7 @@ static const FunctionLibraryRuntime::Handle kInvalidHandle = -1;
class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
public:
FunctionLibraryRuntimeImpl(Device* device, Runner runner,
int graph_def_version,
const FunctionLibraryDefinition* lib_def);
~FunctionLibraryRuntimeImpl() override;
@@ -263,6 +264,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
Device* const device_;
Runner runner_ = nullptr;
const int graph_def_version_;
const FunctionLibraryDefinition* const lib_def_;
std::function<Status(const string&, const OpDef**)> get_func_sig_;
std::function<Status(const NodeDef&, OpKernel**)> create_kernel_;
@@ -298,8 +300,12 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
};
FunctionLibraryRuntimeImpl::FunctionLibraryRuntimeImpl(
Device* device, Runner runner, const FunctionLibraryDefinition* lib_def)
: device_(device), runner_(runner), lib_def_(lib_def) {
Device* device, Runner runner, int graph_def_version,
const FunctionLibraryDefinition* lib_def)
: device_(device),
runner_(runner),
graph_def_version_(graph_def_version),
lib_def_(lib_def) {
get_func_sig_ = [this](const string& op, const OpDef** sig) {
Status s;
*sig = lib_def_->LookUp(op, &s);
@@ -368,7 +374,8 @@ const FunctionBody* FunctionLibraryRuntimeImpl::GetFunctionBody(Handle h) {
Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
OpKernel** kernel) {
if (ndef.op() != kGradientOp && (lib_def_->Find(ndef.op()) == nullptr)) {
return CreateNonCachedKernel(device_, this, ndef, kernel);
return CreateNonCachedKernel(device_, this, ndef, graph_def_version_,
kernel);
}
// Try to instantiate this function for the func/attr. Maybe its
@@ -384,7 +391,8 @@ Status FunctionLibraryRuntimeImpl::CreateKernel(const NodeDef& ndef,
auto device_type = DeviceType(device_->attributes().device_type());
OpKernelConstruction construction(
device_type, device_, device_->GetAllocator(AllocatorAttributes()), &ndef,
&fbody->fdef.signature(), this, fbody->arg_types, fbody->ret_types, &s);
&fbody->fdef.signature(), this, fbody->arg_types, fbody->ret_types,
graph_def_version_, &s);
*kernel = new CallOp(handle, &construction);
if (!s.ok()) {
delete kernel;
@@ -628,8 +636,10 @@ bool FunctionLibraryRuntimeImpl::IsDefined(const string& function_name) {
}
FunctionLibraryRuntime* NewFunctionLibraryRuntime(
Device* device, Runner runner, const FunctionLibraryDefinition* lib_def) {
return new FunctionLibraryRuntimeImpl(device, runner, lib_def);
Device* device, Runner runner, int graph_def_version,
const FunctionLibraryDefinition* lib_def) {
return new FunctionLibraryRuntimeImpl(device, runner, graph_def_version,
lib_def);
}
bool RemoveDeadNodes(Graph* g) {
......
@@ -33,7 +33,8 @@ namespace tensorflow {
typedef std::function<void()> Closure;
typedef std::function<void(Closure)> Runner;
FunctionLibraryRuntime* NewFunctionLibraryRuntime(
Device* device, Runner runner, const FunctionLibraryDefinition* lib_def);
Device* device, Runner runner, int graph_def_version,
const FunctionLibraryDefinition* lib_def);
// FunctionLibraryRuntime::GetFunctionBody returns a description of an
// instantiated function that is represented as a Graph with arg/ret
......
@@ -29,6 +29,7 @@ limitations under the License.
#include "tensorflow/core/platform/port.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/device_name_utils.h"
#if defined(PLATFORM_GOOGLE)
@@ -66,13 +67,16 @@ Benchmark::Benchmark(const string& device, Graph* g,
rendez_ = NewLocalRendezvous();
const int graph_def_version = g->version();
if (init) {
Executor* init_exec;
TF_CHECK_OK(NewLocalExecutor(
{
device_, nullptr, false,
[this](const NodeDef& ndef, OpKernel** kernel) {
return CreateNonCachedKernel(device_, nullptr, ndef, kernel);
[this, graph_def_version](const NodeDef& ndef, OpKernel** kernel) {
return CreateNonCachedKernel(device_, nullptr, ndef,
graph_def_version, kernel);
},
[](OpKernel* kernel) { DeleteNonCachedKernel(kernel); },
},
@@ -87,8 +91,9 @@ Benchmark::Benchmark(const string& device, Graph* g,
TF_CHECK_OK(NewLocalExecutor(
{
device_, nullptr, false,
[this](const NodeDef& ndef, OpKernel** kernel) {
return CreateNonCachedKernel(device_, nullptr, ndef, kernel);
[this, graph_def_version](const NodeDef& ndef, OpKernel** kernel) {
return CreateNonCachedKernel(device_, nullptr, ndef,
graph_def_version, kernel);
},
[](OpKernel* kernel) { DeleteNonCachedKernel(kernel); },
},
......
@@ -578,20 +578,19 @@ Status SupportedDeviceTypesForNode(
return Status::OK();
}
std::unique_ptr<OpKernel> CreateOpKernel(DeviceType device_type,
DeviceBase* device,
Allocator* allocator,
const NodeDef& node_def,
Status* status) {
std::unique_ptr<OpKernel> CreateOpKernel(
DeviceType device_type, DeviceBase* device, Allocator* allocator,
const NodeDef& node_def, int graph_def_version, Status* status) {
OpKernel* kernel = nullptr;
*status = CreateOpKernel(device_type, device, allocator, nullptr, node_def,
&kernel);
graph_def_version, &kernel);
return std::unique_ptr<OpKernel>(kernel);
}
Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
Allocator* allocator, FunctionLibraryRuntime* flib,
const NodeDef& node_def, OpKernel** kernel) {
const NodeDef& node_def, int graph_def_version,
OpKernel** kernel) {
VLOG(1) << "Instantiating kernel for node: " << SummarizeNodeDef(node_def);
// Look up the Op registered for this op name.
@@ -629,7 +628,8 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
// Everything needed for OpKernel construction.
OpKernelConstruction context(device_type, device, allocator, &node_def,
op_def, flib, inputs, outputs, &s);
op_def, flib, inputs, outputs, graph_def_version,
&s);
*kernel = (*registration->factory)(&context);
if (!s.ok()) {
delete *kernel;
......
@@ -179,7 +179,8 @@ class OpKernelConstruction {
Allocator* allocator, const NodeDef* node_def,
const OpDef* op_def, FunctionLibraryRuntime* flib,
const DataTypeSlice& input_types,
const DataTypeSlice& output_types, Status* status)
const DataTypeSlice& output_types, int graph_def_version,
Status* status)
: device_type_(device_type),
device_(device),
allocator_(allocator),
@@ -188,6 +189,7 @@ class OpKernelConstruction {
flib_(flib),
input_types_(input_types),
output_types_(output_types),
graph_def_version_(graph_def_version),
status_(status) {}
Env* env() const { return device_->env(); }
@@ -270,6 +272,9 @@ class OpKernelConstruction {
// CHECK_NOTNULL(function_library())->Instantiate("Foo", ...).
FunctionLibraryRuntime* function_library() const { return flib_; }
// The GraphDef version whose behavior we should follow.
const int graph_def_version() const { return graph_def_version_; }
private:
const DeviceType device_type_;
DeviceBase* const device_;
@@ -279,6 +284,7 @@ class OpKernelConstruction {
FunctionLibraryRuntime* flib_;
DataTypeSlice input_types_;
DataTypeSlice output_types_;
const int graph_def_version_;
Status* status_;
TF_DISALLOW_COPY_AND_ASSIGN(OpKernelConstruction);
@@ -903,10 +909,12 @@ class OpKernelContext {
std::unique_ptr<OpKernel> CreateOpKernel(DeviceType device_type,
DeviceBase* device,
Allocator* allocator,
const NodeDef& def, Status* status);
const NodeDef& def,
int graph_def_version, Status* status);
Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
Allocator* allocator, FunctionLibraryRuntime* flib,
const NodeDef& def, OpKernel** kernel);
const NodeDef& def, int graph_def_version,
OpKernel** kernel);
// Returns into 'device_types' the subset of prioritized_types that this
// binary has registered for the given NodeDef.
......
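The new graph_def_version() accessor lets a kernel capture, at construction time, the version of the GraphDef it is running under and gate behavior changes on it. A standalone analog of that pattern (mock classes for illustration; the real accessor lives on OpKernelConstruction as shown above):

#include <cstdio>

struct OpKernelConstruction {  // mock of the construction-time context
  int version;
  int graph_def_version() const { return version; }
};

struct MyOp {
  explicit MyOp(const OpKernelConstruction& ctx)
      // A hypothetical semantics change introduced at version 5.
      : use_new_semantics_(ctx.graph_def_version() >= 5) {}
  void Compute() const {
    std::puts(use_new_semantics_ ? "new semantics" : "legacy semantics");
  }
  bool use_new_semantics_;
};

int main() {
  MyOp from_old_graph(OpKernelConstruction{4});
  MyOp from_new_graph(OpKernelConstruction{5});
  from_old_graph.Compute();  // legacy semantics
  from_new_graph.Compute();  // new semantics
}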
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/public/version.h"
class DummyKernel : public tensorflow::OpKernel {
public:
@@ -131,9 +132,9 @@ class OpKernelTest : public ::testing::Test {
const DataTypeVector& inputs,
const DataTypeVector& outputs) {
Status status;
std::unique_ptr<OpKernel> op(
CreateOpKernel(device_type, &device_, cpu_allocator(),
CreateNodeDef(op_type, inputs), &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(
device_type, &device_, cpu_allocator(), CreateNodeDef(op_type, inputs),
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(status.ok()) << status;
EXPECT_TRUE(op != nullptr);
if (op != nullptr) {
@@ -147,8 +148,9 @@ class OpKernelTest : public ::testing::Test {
NodeDef node_def;
protobuf::TextFormat::ParseFromString(ascii_node_def, &node_def);
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
device_type, &device_, cpu_allocator(), node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, &device_,
cpu_allocator(), node_def,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(op == nullptr);
EXPECT_FALSE(status.ok());
if (!status.ok()) {
@@ -286,7 +288,8 @@ TEST_F(OpKernelTest, SaveTempFalse) {
Status status;
std::unique_ptr<OpKernel> op(
CreateOpKernel(DEVICE_CPU, params.device, cpu_allocator(),
CreateNodeDef("Test1", {DT_FLOAT, DT_INT32}), &status));
CreateNodeDef("Test1", {DT_FLOAT, DT_INT32}),
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(status.ok());
params.op_kernel = op.get();
OpKernelContext* ctx = new OpKernelContext(params);
@@ -307,7 +310,8 @@ TEST_F(OpKernelTest, SaveTempTrue) {
Status status;
std::unique_ptr<OpKernel> op(
CreateOpKernel(DEVICE_CPU, params.device, cpu_allocator(),
CreateNodeDef("Test1", {DT_FLOAT, DT_INT32}), &status));
CreateNodeDef("Test1", {DT_FLOAT, DT_INT32}),
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(status.ok());
params.op_kernel = op.get();
OpKernelContext* ctx = new OpKernelContext(params);
@@ -354,8 +358,9 @@ class OpKernelBuilderTest : public ::testing::Test {
DeviceBase device(env);
// Test CreateOpKernel()
std::unique_ptr<OpKernel> op(
CreateOpKernel(device_type, &device, cpu_allocator(), def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, &device,
cpu_allocator(), def,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(status.ok()) << status;
EXPECT_TRUE(op != nullptr);
if (op != nullptr) {
@@ -387,8 +392,9 @@ class OpKernelBuilderTest : public ::testing::Test {
DeviceBase device(env);
// Test CreateOpKernel().
std::unique_ptr<OpKernel> op(
CreateOpKernel(device_type, &device, cpu_allocator(), def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(device_type, &device,
cpu_allocator(), def,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_TRUE(op == nullptr);
EXPECT_FALSE(status.ok());
if (!status.ok()) {
......
@@ -25,6 +25,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/version.h"
namespace tensorflow {
@@ -64,8 +65,8 @@ class OpSegmentTest : public ::testing::Test {
OpSegment::CreateKernelFn GetFn(const NodeDef* ndef) {
return [this, ndef](OpKernel** kernel) {
Status s;
auto created =
CreateOpKernel(DEVICE_CPU, &device_, cpu_allocator(), *ndef, &s);
auto created = CreateOpKernel(DEVICE_CPU, &device_, cpu_allocator(),
*ndef, TF_GRAPH_DEF_VERSION, &s);
if (s.ok()) {
*kernel = created.release();
}
......
@@ -808,13 +808,13 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
// TODO(keveman): cuDNN only supports equal padding on both sides, so only
// calling it when that is true. Remove this check when (if?) cuDNN starts
// supporting different padding.
bool padding_compatible =
(padding_rows % 2 == 0) && (padding_cols % 2 == 0);
bool rows_odd = (padding_rows % 2 != 0);
bool cols_odd = (padding_cols % 2 != 0);
auto* stream = context->op_device_context<GPUDeviceContext>()->stream();
OP_REQUIRES(context, stream, errors::Internal("No GPU stream available."));
if (use_cudnn_ && padding_compatible) {
if (use_cudnn_) {
if (filter_rows == 1 && filter_cols == 1 && stride == 1) {
// 1x1 filter, so call cublas directly.
const uint64 m = batch * input_rows * input_cols;
@@ -842,10 +842,22 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
return;
}
TensorShape compatible_input_shape;
if (rows_odd || cols_odd) {
// If a padding dimension is odd, we have one more element on the right
// side or the bottom side. This is unsupported in cudnn. Therefore,
// we pad that extra element and make it compatible.
compatible_input_shape = TensorShape(
{input_shape.dim_size(0), input_shape.dim_size(1) + rows_odd,
input_shape.dim_size(2) + cols_odd, input_shape.dim_size(3)});
} else {
compatible_input_shape = input_shape;
}
perftools::gputools::dnn::BatchDescriptor input_desc;
input_desc.set_count(batch)
.set_height(input_rows)
.set_width(input_cols)
.set_height(compatible_input_shape.dim_size(1))
.set_width(compatible_input_shape.dim_size(2))
.set_feature_map_count(in_depth)
.set_layout(perftools::gputools::dnn::DataLayout::kBatchDepthYX);
perftools::gputools::dnn::BatchDescriptor output_desc;
@@ -903,11 +915,15 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
transformed_out_backprop.tensor<T, 4>());
Tensor pre_transformed_in_backprop;
OP_REQUIRES_OK(context,
context->allocate_temp(
DataTypeToEnum<T>::value,
TensorShape({batch, in_depth, input_rows, input_cols}),
&pre_transformed_in_backprop));
OP_REQUIRES_OK(context, context->allocate_temp(
DataTypeToEnum<T>::value,
TensorShape({
compatible_input_shape.dim_size(0),
compatible_input_shape.dim_size(3),
compatible_input_shape.dim_size(1),
compatible_input_shape.dim_size(2),
}),
&pre_transformed_in_backprop));
auto out_backprop_ptr =
AsDeviceMemory(transformed_out_backprop.template flat<T>().data(),
@@ -937,6 +953,28 @@ class Conv2DSlowBackpropInputOp : public OpKernel {
filter_shape.DebugString(), ")"));
}
if (rows_odd || cols_odd) {
Tensor in_backprop_remove_padding;
OP_REQUIRES_OK(context,
context->allocate_temp(
DataTypeToEnum<T>::value,
TensorShape({
input_shape.dim_size(0), input_shape.dim_size(3),
input_shape.dim_size(1), input_shape.dim_size(2),
}),
&in_backprop_remove_padding));
// Remove the padding for odd rows or cols.
functor::PadInput<GPUDevice, T, int>()(
context->template eigen_device<GPUDevice>(),
To32Bit(const_cast<const Tensor&>(pre_transformed_in_backprop)
.tensor<T, 4>()),
0, -rows_odd, 0, -cols_odd,
To32Bit(in_backprop_remove_padding.tensor<T, 4>()));
pre_transformed_in_backprop = in_backprop_remove_padding;
}
auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; };
functor::NCHWToNHWC<Device, T>()(
context->eigen_device<Device>(),
......
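The rows_odd/cols_odd handling exists because SAME padding can be asymmetric: when the total padding along a dimension is odd, the extra element goes on the bottom/right, which cuDNN cannot express. A worked example of the arithmetic, assuming the usual SAME-padding formula (a sketch, not the kernel's exact code):

#include <algorithm>
#include <cstdio>

int main() {
  const int in = 6, filter = 3, stride = 2;
  const int out = (in + stride - 1) / stride;                     // ceil(6/2) = 3
  const int pad = std::max((out - 1) * stride + filter - in, 0);  // 1
  // pad = 1 is odd: one unmatched element on the bottom/right. The kernel
  // therefore pads the input from 6 to 7 rows, runs cuDNN with symmetric
  // padding, then slices the extra gradient row off via PadInput(..., -1).
  std::printf("output=%d total_pad=%d odd=%d\n", out, pad, pad % 2 != 0);
}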
@@ -47,6 +47,7 @@ limitations under the License.
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/port.h"
@@ -432,8 +433,9 @@ static void BM_LRNFloat(int iters, int depth, int cols, int rows,
.Finalize(&lrn_node_def));
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), lrn_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), lrn_node_def,
TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
@@ -516,8 +518,9 @@ static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth,
.Finalize(&avgpool_node_def);
TF_CHECK_OK(status);
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), avgpool_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), avgpool_node_def,
TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
params.device = device.get();
@@ -623,8 +626,9 @@ static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols,
.Attr("padding", padding == VALID ? "VALID" : "SAME")
.Finalize(&avgpool_grad_node_def);
TF_CHECK_OK(status);
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, nullptr, cpu_allocator(), avgpool_grad_node_def, &status));
std::unique_ptr<OpKernel> op(
CreateOpKernel(DEVICE_CPU, nullptr, cpu_allocator(),
avgpool_grad_node_def, TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
params.device = device.get();
@@ -712,8 +716,9 @@ static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
.Attr("padding", padding == VALID ? "VALID" : "SAME")
.Finalize(&maxpool_node_def);
TF_CHECK_OK(status);
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), maxpool_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), maxpool_node_def,
TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
params.device = device.get();
@@ -889,8 +894,9 @@ static void BM_ReluFloat(int iters, int batch_size, int rows, int cols,
.Input(FakeInput(DT_FLOAT))
.Finalize(&relu_node_def);
TF_CHECK_OK(status);
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), relu_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), relu_node_def,
TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
params.device = device.get();
@@ -960,8 +966,9 @@ static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth,
.Input("input", 0, DT_FLOAT)
.Finalize(&softmax_node_def));
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), softmax_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), softmax_node_def,
TF_GRAPH_DEF_VERSION, &status));
TF_CHECK_OK(status);
OpKernelContext::Params params;
params.device = device.get();
......
@@ -20,6 +20,7 @@ limitations under the License.
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/util/util.h"
namespace tensorflow {
@@ -57,7 +58,7 @@ class DynamicPartitionOp_Shared : public OpKernel {
const int32 p = e_partitions(i);
OP_REQUIRES(c, p >= 0 && p < num_partitions_,
errors::InvalidArgument(
"partitions", SliceString((*partitions)->shape(), i),
"partitions", SliceDebugString((*partitions)->shape(), i),
" = ", p, " is not in [0, ", num_partitions_, ")"));
partition_count[p]++;
}
@@ -77,30 +78,6 @@ class DynamicPartitionOp_Shared : public OpKernel {
protected:
int num_partitions_;
static string SliceString(const TensorShape& shape, const int64 flat) {
// Special case rank 0 and 1
const int dims = shape.dims();
if (dims == 0) return "";
if (dims == 1) return strings::StrCat("[", flat, "]");
// Compute strides
gtl::InlinedVector<int64, 32> strides(dims);
strides.back() = 1;
for (int i = dims - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape.dim_size(i + 1);
}
// Unflatten index
int64 left = flat;
string result;
for (int i = 0; i < dims; i++) {
strings::StrAppend(&result, i ? "," : "[", left / strides[i]);
left %= strides[i];
}
strings::StrAppend(&result, "]");
return result;
}
};
template <class T>
......
@@ -39,6 +39,7 @@ limitations under the License.
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/public/status.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/tensor_slice_reader_cache.h"
namespace tensorflow {
@@ -80,7 +81,7 @@ class OpsTestBase : public ::testing::Test {
Status InitOp() {
Status status;
kernel_ = CreateOpKernel(device_type_, device_.get(), allocator(),
node_def_, &status);
node_def_, TF_GRAPH_DEF_VERSION, &status);
if (kernel_ != nullptr) input_types_ = kernel_->input_types();
return status;
}
......
@@ -72,8 +72,9 @@ TEST_F(RestoreOpTest, RestoreInt) {
gtl::InlinedVector<TensorValue, 4> inputs;
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), save,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_OK(status);
// Run it
@@ -153,8 +154,9 @@ TEST_F(RestoreOpTest, RestoreFloat) {
gtl::InlinedVector<TensorValue, 4> inputs;
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), save,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_OK(status);
// Run it
@@ -249,8 +251,9 @@ TEST_F(RestoreSliceOpTest, RestoreInt) {
gtl::InlinedVector<TensorValue, 4> inputs;
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), save, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), save,
TF_GRAPH_DEF_VERSION, &status));
EXPECT_OK(status);
// Run it
......
@@ -27,6 +27,7 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/public/status.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/util/util.h"
namespace tensorflow {
@@ -198,11 +199,12 @@ class UnsortedSegmentSumOp : public OpKernel {
const int32 N = segment_flat.dimension(0);
const int32 output_rows = num_segments.scalar<int32>()();
if (N > 0) {
Eigen::Tensor<Index, 0, Eigen::RowMajor> m = segment_flat.maximum();
OP_REQUIRES(
context, m() < output_rows,
errors::InvalidArgument("More segments found than output size"));
for (int i = 0; i < N; i++) {
int j = segment_flat(i);
OP_REQUIRES(context, 0 <= j && j < output_rows,
errors::InvalidArgument(
"segment_ids", SliceDebugString(segment_ids.shape(), i),
" = ", j, " is out of range [0, ", output_rows, ")"));
}
TensorShape output_shape;
......
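The old guard only checked segment_flat.maximum() against output_rows, so a negative id sailed through and indexed before the start of the output buffer (the crash in #466). The replacement checks every element, roughly as in this standalone sketch that mirrors the new error message:

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> segment_ids = {-1};  // the failing case from issue #466
  const int output_rows = 2;
  for (std::size_t i = 0; i < segment_ids.size(); ++i) {
    const int j = segment_ids[i];
    if (!(0 <= j && j < output_rows)) {  // per-element range check
      std::printf("segment_ids[%zu] = %d is out of range [0, %d)\n",
                  i, j, output_rows);
      return 1;
    }
  }
  return 0;
}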
@@ -34,6 +34,7 @@ limitations under the License.
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/public/tensor.h"
#include "tensorflow/core/public/version.h"
namespace tensorflow {
@@ -63,8 +64,9 @@ static void BM_SegmentReduction(int iters, string reduction, Index num_rows,
.Input(FakeInput(DataTypeToEnum<Index>::v()))
.Finalize(&reduction_node_def));
Status status;
std::unique_ptr<OpKernel> reduction_op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), reduction_node_def, &status));
std::unique_ptr<OpKernel> reduction_op(
CreateOpKernel(DEVICE_CPU, device.get(), cpu_allocator(),
reduction_node_def, TF_GRAPH_DEF_VERSION, &status));
OpKernelContext::Params params;
params.device = device.get();
params.frame_iter = FrameAndIter(0, 0);
......
@@ -246,8 +246,9 @@ static void BM_SparseToDense(int iters, const int bm_arg) {
.Finalize(&sparse_node_def));
Status status;
std::unique_ptr<OpKernel> op(CreateOpKernel(
DEVICE_CPU, device.get(), cpu_allocator(), sparse_node_def, &status));
std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
cpu_allocator(), sparse_node_def,
TF_GRAPH_DEF_VERSION, &status));
OpKernelContext::Params params;
params.device = device.get();
......
@@ -15,7 +15,10 @@ limitations under the License.
#include "tensorflow/core/util/util.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
namespace tensorflow {
StringPiece NodeNamePrefix(const StringPiece& op_name) {
@@ -93,4 +96,28 @@ string PrintMemory(const char* ptr, int n) {
return ret;
}
string SliceDebugString(const TensorShape& shape, const int64 flat) {
// Special case rank 0 and 1
const int dims = shape.dims();
if (dims == 0) return "";
if (dims == 1) return strings::StrCat("[", flat, "]");
// Compute strides
gtl::InlinedVector<int64, 32> strides(dims);
strides.back() = 1;
for (int i = dims - 2; i >= 0; i--) {
strides[i] = strides[i + 1] * shape.dim_size(i + 1);
}
// Unflatten index
int64 left = flat;
string result;
for (int i = 0; i < dims; i++) {
strings::StrAppend(&result, i ? "," : "[", left / strides[i]);
left %= strides[i];
}
strings::StrAppend(&result, "]");
return result;
}
} // namespace tensorflow
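To see how SliceDebugString turns a flat index back into a Python-style subscript, here is a standalone mirror of the function with a worked example (a hypothetical re-implementation for illustration, not the TensorFlow code itself):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

std::string SliceDebugString(const std::vector<int64_t>& dims, int64_t flat) {
  const int n = static_cast<int>(dims.size());
  if (n == 0) return "";
  if (n == 1) return "[" + std::to_string(flat) + "]";
  // Row-major strides: for {2, 3, 4} they are {12, 4, 1}.
  std::vector<int64_t> strides(n);
  strides.back() = 1;
  for (int i = n - 2; i >= 0; i--) strides[i] = strides[i + 1] * dims[i + 1];
  // Peel off one coordinate per dimension.
  std::string result;
  int64_t left = flat;
  for (int i = 0; i < n; i++) {
    result += (i ? "," : "[") + std::to_string(left / strides[i]);
    left %= strides[i];
  }
  return result + "]";
}

int main() {
  // 17 = 1*12 + 1*4 + 1*1, so flat index 17 in a {2, 3, 4} tensor names
  // element [1,1,1].
  std::cout << SliceDebugString({2, 3, 4}, 17) << "\n";  // prints [1,1,1]
}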
@@ -17,6 +17,7 @@ limitations under the License.
#define TENSORFLOW_UTIL_UTIL_H_
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/public/tensor_shape.h"
namespace tensorflow {
@@ -50,6 +51,11 @@ class MovingAverage {
// like "00 01 ef cd cd ef".
string PrintMemory(const char* ptr, int n);
// Given a flattened index into a tensor, computes a string s so that
// StrAppend("tensor", s) is a Python indexing expression. E.g.,
// "tensor", "tensor[i]", "tensor[i, j]", etc.
string SliceDebugString(const TensorShape& shape, const int64 flat);
} // namespace tensorflow
#endif // TENSORFLOW_UTIL_UTIL_H_
@@ -236,14 +236,14 @@ py_test(
)
tf_gen_op_wrapper_py(
name = "test_kernel_label_op",
out = "framework/test_kernel_label_op.py",
deps = [":test_kernel_label_op_kernel"],
name = "test_ops",
out = "framework/test_ops.py",
deps = [":test_ops_kernels"],
)
cc_library(
name = "test_kernel_label_op_kernel",
srcs = ["framework/test_kernel_label_op.cc"],
name = "test_ops_kernels",
srcs = ["framework/test_ops.cc"],
linkstatic = 1,
deps = ["//tensorflow/core:framework"],
alwayslink = 1,
@@ -259,7 +259,7 @@ py_test(
":ops",
":platform_test",
":session",
":test_kernel_label_op",
":test_ops",
],
)
@@ -751,7 +751,7 @@ tf_cuda_library(
hdrs = ["client/tf_session_helper.h"],
deps = [
":construction_fails_op",
":test_kernel_label_op_kernel",
":test_ops_kernels",
"//tensorflow/core",
"//tensorflow/core:direct_session",
"//tensorflow/core:kernels",
......
@@ -24,8 +24,9 @@ from tensorflow.python.framework import device as pydev
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import test_kernel_label_op
from tensorflow.python.framework import test_ops
from tensorflow.python.framework import test_util
from tensorflow.python.framework import versions
from tensorflow.python.ops import common_shapes
from tensorflow.python.ops import variables
from tensorflow.python.platform import googletest
@@ -927,21 +928,21 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
def testNoLabel(self):
with self.test_session():
self.assertAllEqual(b"My label is: default",
test_kernel_label_op.kernel_label().eval())
test_ops.kernel_label().eval())
def testLabelMap(self):
with self.test_session() as sess:
default_1 = test_kernel_label_op.kernel_label()
default_1 = test_ops.kernel_label()
# pylint: disable=protected-access
with sess.graph._kernel_label_map({"KernelLabel": "overload_1"}):
overload_1_1 = test_kernel_label_op.kernel_label()
overload_1_1 = test_ops.kernel_label()
with sess.graph._kernel_label_map({"KernelLabel": "overload_2"}):
overload_2 = test_kernel_label_op.kernel_label()
overload_2 = test_ops.kernel_label()
with sess.graph._kernel_label_map({"KernelLabel": ""}):
default_2 = test_kernel_label_op.kernel_label()
overload_1_2 = test_kernel_label_op.kernel_label()
default_2 = test_ops.kernel_label()
overload_1_2 = test_ops.kernel_label()
# pylint: enable=protected-access
default_3 = test_kernel_label_op.kernel_label()
default_3 = test_ops.kernel_label()
self.assertAllEqual(b"My label is: default", default_1.eval())
self.assertAllEqual(b"My label is: default", default_2.eval())
@@ -951,5 +952,18 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
self.assertAllEqual(b"My label is: overload_2", overload_2.eval())
class GraphDefVersionTest(test_util.TensorFlowTestCase):
def testGraphDefVersion(self):
"""Test that the graphdef version is plumbed through to kernels."""
for version in range(versions.GRAPH_DEF_VERSION_MIN,
versions.GRAPH_DEF_VERSION_MAX + 1):
with ops.Graph().as_default() as g:
g.graph_def_version = version
with self.test_session(graph=g):
v = test_ops.graph_def_version().eval()
self.assertEqual(version, v)
if __name__ == "__main__":
googletest.main()
@@ -21,6 +21,8 @@ namespace tensorflow {
REGISTER_OP("KernelLabel").Output("result: string");
REGISTER_OP("GraphDefVersion").Output("version: int32");
namespace {
enum KernelLabel { DEFAULT_LABEL, OVERLOAD_1_LABEL, OVERLOAD_2_LABEL };
} // namespace
@@ -59,4 +61,22 @@ REGISTER_KERNEL_BUILDER(Name("KernelLabel")
.Label("overload_2"),
KernelLabelOp<OVERLOAD_2_LABEL>);
class GraphDefVersionOp : public OpKernel {
public:
GraphDefVersionOp(OpKernelConstruction* ctx)
: OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {}
void Compute(OpKernelContext* ctx) override {
Tensor* output;
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
output->scalar<int>()() = graph_def_version_;
}
private:
const int graph_def_version_;
};
REGISTER_KERNEL_BUILDER(Name("GraphDefVersion").Device(DEVICE_CPU),
GraphDefVersionOp);
} // end namespace tensorflow
@@ -217,6 +217,14 @@ class UnsortedSegmentSumTest(SegmentReductionHelper):
self.assertAllClose(unsorted_jacob_t, sorted_jacob_t, rtol=1e-3, atol=1e-3)
self.assertAllClose(unsorted_jacob_n, sorted_jacob_n, rtol=1e-3, atol=1e-3)
def testBadIndices(self):
with self.test_session():
for bad in [[-1]], [[7]]:
unsorted = tf.unsorted_segment_sum([[17]], bad, num_segments=2)
with self.assertRaisesOpError(
r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
unsorted.eval()
class SparseSegmentReductionHelper(SegmentReductionHelper):
......