Commit cb4e3951 authored by A. Unique TensorFlower, committed by TensorFlower Gardener


Add a convenience method "forward_input_or_allocate_temp" to use buffer forwarding for temporary tensors.

Add buffer forwarding in a few additional places, including pooling ops and fused RNN kernels.
Change: 149958948
Parent 35a4183c
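
For context, a minimal sketch of how a kernel can use the new convenience method. This example is not part of the commit: the kernel name is hypothetical and a float input is assumed; only the OpKernelContext calls correspond to the API added here.

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {

// Hypothetical kernel that squares its input element-wise. If the buffer of
// input 0 can be reused (the kernel holds the only reference to it),
// "scratch" takes ownership of that buffer and no new allocation happens;
// otherwise a fresh temporary is allocated.
class SquareInPlaceOp : public OpKernel {
 public:
  explicit SquareInPlaceOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    Tensor scratch;
    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_temp(
                            {0}, input.dtype(), input.shape(), &scratch));
    // Element-wise, so this is safe even when "scratch" aliases the input.
    scratch.flat<float>() = input.flat<float>().square();
    ctx->set_output(0, scratch);
  }
};

}  // namespace tensorflow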
@@ -122,9 +122,9 @@ class GRUCellBlockOp : public OpKernel {
                             &c_tensor));
     Tensor* h_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("h", TensorShape({batch_size, cell_size}),
-                                  &h_tensor));
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"h_prev"}, "h",
+                            TensorShape({batch_size, cell_size}), &h_tensor));
     // Allocate temp tensors.
     Tensor x_h_prev_tensor;
@@ -304,14 +304,15 @@ class GRUBlockCellGradOp : public OpKernel {
     // Create output tensors.
     Tensor* d_x_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("d_x", TensorShape({batch_size, input_size}),
-                                  &d_x_tensor));
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"x"}, "d_x", TensorShape({batch_size, input_size}),
+                            &d_x_tensor));
     Tensor* d_h_prev_tensor = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(
-                            "d_h_prev", TensorShape({batch_size, cell_size}),
-                            &d_h_prev_tensor));
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output(
+                 {"h_prev"}, "d_h_prev", TensorShape({batch_size, cell_size}),
+                 &d_h_prev_tensor));
     Tensor* d_c_bar_tensor;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(
......
@@ -112,9 +112,9 @@ class LSTMBlockCellOp : public OpKernel {
     // Allocate our output tensors.
     Tensor* i_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("i", TensorShape({batch_size, cell_size}),
-                                  &i_tensor));
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"h_prev"}, "i",
+                            TensorShape({batch_size, cell_size}), &i_tensor));
     Tensor* cs_tensor = nullptr;
     OP_REQUIRES_OK(
@@ -127,9 +127,9 @@ class LSTMBlockCellOp : public OpKernel {
                             &f_tensor));
     Tensor* o_tensor = nullptr;
-    OP_REQUIRES_OK(
-        ctx, ctx->allocate_output("o", TensorShape({batch_size, cell_size}),
-                                  &o_tensor));
+    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
+                            {"cs_prev"}, "o",
+                            TensorShape({batch_size, cell_size}), &o_tensor));
     Tensor* ci_tensor = nullptr;
     OP_REQUIRES_OK(
@@ -387,10 +387,10 @@ class LSTMBlockCellGradOp : public OpKernel {
     // Allocate our output tensors.
     Tensor* cs_prev_grad_tensor = nullptr;
-    OP_REQUIRES_OK(ctx,
-                   ctx->allocate_output("cs_prev_grad",
-                                        TensorShape({batch_size, cell_size}),
-                                        &cs_prev_grad_tensor));
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output(
+                 {"cs_grad"}, "cs_prev_grad",
+                 TensorShape({batch_size, cell_size}), &cs_prev_grad_tensor));
     Tensor* dicfo_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(
@@ -398,16 +398,19 @@ class LSTMBlockCellGradOp : public OpKernel {
                             &dicfo_tensor));
     Tensor* wci_grad_tensor = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output("wci_grad", wci_tensor->shape(),
-                                             &wci_grad_tensor));
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output(
+                 {"wci"}, "wci_grad", wci_tensor->shape(), &wci_grad_tensor));
     Tensor* wcf_grad_tensor = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output("wcf_grad", wcf_tensor->shape(),
-                                             &wcf_grad_tensor));
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output(
+                 {"wcf"}, "wcf_grad", wcf_tensor->shape(), &wcf_grad_tensor));
     Tensor* wco_grad_tensor = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output("wco_grad", wco_tensor->shape(),
-                                             &wco_grad_tensor));
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output(
+                 {"wco"}, "wco_grad", wco_tensor->shape(), &wco_grad_tensor));

     // Allocate our temp tensors.
     Tensor do_tensor;
......
@@ -438,6 +438,21 @@ std::unique_ptr<Tensor> OpKernelContext::forward_input(
   return output_tensor;
 }

+Status OpKernelContext::forward_input_or_allocate_temp(
+    gtl::ArraySlice<int> candidate_input_indices, DataType type,
+    const TensorShape& shape, const AllocatorAttributes& allocator_attr,
+    Tensor* out_temp) {
+  for (int input_index : candidate_input_indices) {
+    std::unique_ptr<Tensor> new_tensor =
+        forward_input(input_index, type, shape, DEVICE_MEMORY, allocator_attr);
+    if (new_tensor != nullptr) {
+      *out_temp = std::move(*new_tensor);
+      return Status::OK();
+    }
+  }
+  return allocate_temp(type, shape, out_temp, allocator_attr);
+}
+
 void OpKernelContext::delete_ref_input(int index, bool lock_held) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, num_inputs());
......
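
The loop above tries each candidate index in order and falls back to allocate_temp() only when no candidate can be forwarded; forward_input() succeeds only when the input's buffer can be safely reused (roughly: matching type, shape, and allocator attributes, with no other holder of the buffer). A hedged usage sketch follows; the helper name and the candidate indices {0, 1} are placeholders, not from this commit.

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"

namespace tensorflow {

// Hypothetical helper: reuse the buffer of input 0 or input 1 as float
// scratch space of the given shape, allocating only as a last resort.
Status GetFloatScratch(OpKernelContext* ctx, const TensorShape& shape,
                       Tensor* scratch) {
  return ctx->forward_input_or_allocate_temp(
      {0, 1}, DataTypeToEnum<float>::v(), shape, scratch);
}

}  // namespace tensorflow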
@@ -715,6 +715,21 @@ class OpKernelContext {
       StringPiece output_name, const TensorShape& output_shape,
       Tensor** output) TF_MUST_USE_RESULT;

+  // Tries to reuse one of the inputs given in input_indices as a temporary.
+  // If none of the given inputs can be forwarded, calls
+  // allocate_temp() to allocate a new temporary buffer.
+  Status forward_input_or_allocate_temp(
+      gtl::ArraySlice<int> candidate_input_indices, DataType type,
+      const TensorShape& shape, const AllocatorAttributes& allocator_attr,
+      Tensor* out_temp) TF_MUST_USE_RESULT;
+
+  Status forward_input_or_allocate_temp(
+      gtl::ArraySlice<int> candidate_input_indices, DataType type,
+      const TensorShape& shape, Tensor* out_temp) TF_MUST_USE_RESULT {
+    return forward_input_or_allocate_temp(candidate_input_indices, type, shape,
+                                          AllocatorAttributes(), out_temp);
+  }
+
   // Output

   // Returns the named list-valued output in "list", as defined in the OpDef.
......
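
A short sketch exercising both overloads declared above. The input indices and the host-memory attribute are illustrative assumptions, not part of this commit.

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {

// Hypothetical helper allocating two scratch tensors.
Status AllocateScratchPair(OpKernelContext* ctx, const TensorShape& shape,
                           Tensor* a, Tensor* b) {
  // Overload without attributes: default AllocatorAttributes are used.
  TF_RETURN_IF_ERROR(
      ctx->forward_input_or_allocate_temp({0}, DT_FLOAT, shape, a));
  // Explicit-attributes overload, here asking for host-resident memory.
  // If input 1's buffer is not compatible with these attributes, forwarding
  // fails and allocate_temp() runs with host_attr instead.
  AllocatorAttributes host_attr;
  host_attr.set_on_host(true);
  return ctx->forward_input_or_allocate_temp({1}, DT_FLOAT, shape, host_attr,
                                             b);
}

}  // namespace tensorflow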
@@ -271,9 +271,9 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Create intermediate in_backprop.
     Tensor in_backprop_tensor_temp;
-    OP_REQUIRES_OK(context,
-                   context->allocate_temp(DataTypeToEnum<double>::v(), in_shape,
-                                          &in_backprop_tensor_temp));
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
+                                {0}, DataTypeToEnum<double>::v(), in_shape,
+                                &in_backprop_tensor_temp));
     in_backprop_tensor_temp.flat<double>().setZero();
     // Transform 4D tensor to 2D matrix.
     EigenDoubleMatrixMap in_backprop_tensor_temp_mat(
......
@@ -256,9 +256,9 @@ class FractionalMaxPoolGradOp : public OpKernel {
     // Step 1
     // ---------
     Tensor tensor_out_dup;
-    OP_REQUIRES_OK(context,
-                   context->allocate_temp(DataTypeToEnum<T>::v(),
-                                          tensor_out.shape(), &tensor_out_dup));
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
+                                {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
+                                &tensor_out_dup));
     Tensor tensor_out_arg_max;
     OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                    tensor_out.shape(),
......
@@ -275,9 +275,9 @@ class MaxPoolingGradOp : public OpKernel {
     const TensorShape& output_shape = tensor_in.shape();
     Tensor tensor_out_dup;
-    OP_REQUIRES_OK(context,
-                   context->allocate_temp(DataTypeToEnum<T>::v(),
-                                          tensor_out.shape(), &tensor_out_dup));
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_temp(
+                                {1}, DataTypeToEnum<T>::v(), tensor_out.shape(),
+                                &tensor_out_dup));
     Tensor tensor_out_arg_max;
     OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64>::v(),
                                                    tensor_out.shape(),
@@ -552,7 +552,8 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
     TensorShape out_shape({params.tensor_in_batch, params.tensor_in_rows,
                            params.tensor_in_cols, params.depth});
     Tensor* grad_out = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &grad_out));
+    OP_REQUIRES_OK(context, context->forward_input_or_allocate_output(
+                                {1}, 0, out_shape, &grad_out));
     LaunchMaxPoolingGradWithArgmax<Device, T>::launch(context, params, grad_in,
                                                       argmax, grad_out);
......