Commit 86dfcdc5 authored by A. Unique TensorFlower, committed by TensorFlower Gardener

Make TF_CALL_REAL_NUMBER_TYPES() and related macros include Eigen::half

in addition to float. Explicitly exempt GPU kernels that rely on atomics,
for which we have no good solution for half yet. Add some fixes in various
places (some in Eigen, some in kernels) to make it all compile.

Note that there are still ops that don't _declare_ half support (those
that use “allnumbertypes” or similar do, those that use “float, double”
don't); these will be fixed in a forthcoming commit.
Change: 119409234
Parent 1cd35932
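For readers skimming the diff: the TF_CALL_* macros below are X-macros, i.e. each type list is a macro that applies a caller-supplied macro once per type, so adding m(Eigen::half) to a list instantiates and registers every kernel built on that list for half as well. A minimal, self-contained sketch of the pattern (the *_DEMO names are hypothetical stand-ins, not the TensorFlow ones):

#include <iostream>
#include <typeinfo>

// Stand-in for a type-list macro like TF_CALL_GPU_NUMBER_TYPES:
#define TF_CALL_GPU_NUMBER_TYPES_DEMO(m) \
  m(float);                              \
  m(double)

template <typename T>
void RegisterKernel() {
  std::cout << "registered kernel for " << typeid(T).name() << "\n";
}

// Stand-in for a per-file registration macro like REGISTER_GPU_KERNEL:
#define REGISTER_GPU_KERNEL_DEMO(type) RegisterKernel<type>()

int main() {
  // Expands to: RegisterKernel<float>(); RegisterKernel<double>();
  TF_CALL_GPU_NUMBER_TYPES_DEMO(REGISTER_GPU_KERNEL_DEMO);
  return 0;
}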
@@ -47,6 +47,7 @@ limitations under the License.
 // Call "m" for all number types that support the comparison operations "<" and
 // ">".
 #define TF_CALL_REAL_NUMBER_TYPES(m) \
+  m(Eigen::half); \
   m(float); \
   m(double); \
   m(int64); \
@@ -56,6 +57,7 @@ limitations under the License.
   m(int8)
 #define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \
+  m(Eigen::half); \
   m(float); \
   m(double); \
   m(int64); \
@@ -85,9 +87,14 @@ limitations under the License.
 // Call "m" on all types supported on GPU.
 #define TF_CALL_GPU_NUMBER_TYPES(m) \
+  m(Eigen::half); \
   m(float); \
   m(double)
+#define TF_CALL_GPU_NUMBER_TYPES_NO_HALF(m) \
+  m(float); \
+  m(double)
 // Call "m" on all quantized types.
 #define TF_CALL_QUANTIZED_TYPES(m) \
   m(qint8); \
@@ -97,6 +104,7 @@ limitations under the License.
 #elif defined(__ANDROID_TYPES_FULL__)
 #define TF_CALL_REAL_NUMBER_TYPES(m) \
+  m(Eigen::half); \
   m(float); \
   m(int32); \
   m(int64)
@@ -104,6 +112,7 @@ limitations under the License.
 #define TF_CALL_NUMBER_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m)
 #define TF_CALL_REAL_NUMBER_TYPES_NO_INT32(m) \
+  m(Eigen::half); \
   m(float); \
   m(int64)
@@ -114,7 +123,7 @@ limitations under the License.
 #define TF_CALL_ALL_TYPES(m) TF_CALL_REAL_NUMBER_TYPES(m)
 // Maybe we could put an empty macro here for Android?
-#define TF_CALL_GPU_NUMBER_TYPES(m) m(float)
+#define TF_CALL_GPU_NUMBER_TYPES(m) m(float) m(Eigen::half)
 // Call "m" on all quantized types.
 #define TF_CALL_QUANTIZED_TYPES(m) \
......
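The TF_CALL_GPU_NUMBER_TYPES_NO_HALF list added above serves the commit message's "GPU kernels that rely on atomics" exemption: the bias-grad and scatter kernels switched to it below accumulate through CUDA atomicAdd, which at the time had no half-precision overload. A rough, hypothetical illustration of the constraint (not TensorFlow's actual kernel):

// Duplicate indices must be combined with an atomic read-modify-write.
__global__ void ScatterAddSketch(float* out, const float* updates,
                                 const int* indices, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // atomicAdd has a float overload; there was no half overload to call
    // here, and emulating one via atomicCAS on a 32-bit word holding a
    // packed 16-bit value is both fiddly and slow.
    atomicAdd(out + indices[i], updates[i]);
  }
}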
@@ -271,7 +271,7 @@ class BiasOp<GPUDevice, T> : public BinaryOp<T> {
     Name("BiasAddV1").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
     BiasOp<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 template <typename T>
@@ -322,7 +322,7 @@ class BiasGradOp<GPUDevice, T> : public OpKernel {
     Name("BiasAddGrad").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
     BiasGradOp<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNEL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 #endif  // GOOGLE_CUDA
......
@@ -227,7 +227,7 @@ void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
   template struct BiasGPU<T>; \
   template struct BiasGradGPU<T>;
-TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_GPU_SPECS);
 }  // end namespace tensorflow
......
@@ -81,7 +81,6 @@ struct FillFunctor<GPUDevice, T> {
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>
 TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
 DEFINE_FILL_GPU(bool);
-DEFINE_FILL_GPU(Eigen::half);
 #undef DEFINE_FILL_GPU
 // Partial specialization of FillFunctor<Device=GPUDevice, T>.
......
@@ -103,8 +103,8 @@ class ResizeAreaOp : public OpKernel {
               j < in_x ? j + 1 - in_x : (j + 1 > in_x1 ? in_x1 - j : 1.0);
           for (int64 c = 0; c < st.channels; ++c) {
 #define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
-            sum_data(c) += input_data(b, BOUND(i, st.in_height),
-                                      BOUND(j, st.in_width), c) *
+            sum_data(c) += float(input_data(b, BOUND(i, st.in_height),
+                                            BOUND(j, st.in_width), c)) *
                            scale_y * scale_x * scale;
 #undef BOUND
           }
......
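The float(...) wrapper above, and the direct-initialization style in the bilinear resize below, work around the same issue: with T = Eigen::half, mixed expressions like half * float do not resolve the way they do for the built-in types, so the code converts to float explicitly and accumulates in float. A small sketch of the idea (assuming Eigen's half support of this era; not the TensorFlow code):

#include <unsupported/Eigen/CXX11/Tensor>  // defines Eigen::half

float Lerp(Eigen::half a, Eigen::half b, float t) {
  // Convert each half to float once, then do all arithmetic in float;
  // writing a + (b - a) * t directly may not compile for half operands.
  const float fa(a);
  const float fb(b);
  return fa + (fb - fa) * t;
}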
@@ -63,13 +63,12 @@ class ResizeBilinearOp : public OpKernel {
             std::min(static_cast<int64>(ceilf(in_x)), (st.in_width - 1));
         const float x_lerp = in_x - left_x_index;
         for (int c = 0; c < st.channels; ++c) {
-          const float top_left = input_data(b, top_y_index, left_x_index, c);
-          const float top_right =
-              input_data(b, top_y_index, right_x_index, c);
-          const float bottom_left =
-              input_data(b, bottom_y_index, left_x_index, c);
-          const float bottom_right =
-              input_data(b, bottom_y_index, right_x_index, c);
+          const float top_left(input_data(b, top_y_index, left_x_index, c));
+          const float top_right(input_data(b, top_y_index, right_x_index, c));
+          const float bottom_left(
+              input_data(b, bottom_y_index, left_x_index, c));
+          const float bottom_right(
+              input_data(b, bottom_y_index, right_x_index, c));
           const float top = top_left + (top_right - top_left) * x_lerp;
           const float bottom =
               bottom_left + (bottom_right - bottom_left) * x_lerp;
......
@@ -332,7 +332,7 @@ class ResizeNearestNeighborGPUOpGrad : public OpKernel {
         .HostMemory("size"), \
     ResizeNearestNeighborGPUOpGrad<T>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNEL);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
......
@@ -110,8 +110,9 @@ template <typename T>
 bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,
                                    const int in_height, const int in_width,
                                    const int channels, const int out_height,
-                                   const int out_width, const float height_scale,
-                                   const float width_scale, T* bottom_diff,
+                                   const int out_width,
+                                   const float height_scale,
+                                   const float width_scale, T* bottom_diff,
                                    const Eigen::GpuDevice& d) {
   const int output_size = batch * channels * out_height * out_width;
   CudaLaunchConfig output_config = GetCudaLaunchConfig(output_size, d);
@@ -120,11 +121,10 @@ bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,
   const int input_size = batch * channels * in_height * in_width;
   CudaLaunchConfig input_config = GetCudaLaunchConfig(input_size, d);
-  ResizeNearestNeighborBackwardNHWC<T><<<input_config.block_count,
-                                         input_config.thread_per_block, 0, d.stream()>>>(
-      input_config.virtual_thread_count, top_diff,
-      in_height, in_width, channels, out_height,
-      out_width, height_scale, width_scale, bottom_diff);
+  ResizeNearestNeighborBackwardNHWC<T><<<
+      input_config.block_count, input_config.thread_per_block, 0, d.stream()>>>(
+      input_config.virtual_thread_count, top_diff, in_height, in_width,
+      channels, out_height, out_width, height_scale, width_scale, bottom_diff);
   return d.ok();
 }
@@ -136,7 +136,7 @@ bool ResizeNearestNeighborBackward(const T* top_diff, const int batch,
                                             const float width_scale, T* bottom_diff, \
                                             const Eigen::GpuDevice& d);
-TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPEC);
 #undef DECLARE_GPU_SPEC
......
@@ -253,8 +253,8 @@ class SampleDistortedBoundingBoxOp : public OpKernel {
     // Note image_size_data(2) is the depth and unused.
     typename TTypes<T, 1>::ConstTensor image_size_data =
         image_size.tensor<T, 1>();
-    const int32 height = image_size_data(0);
-    const int32 width = image_size_data(1);
+    const int32 height(image_size_data(0));
+    const int32 width(image_size_data(1));
     // Ensure that the supplied bounding boxes are sane and convert them to
     // Rectangles.
......
@@ -216,8 +216,8 @@ TF_CALL_ALL_TYPES(REGISTER_SCATTER_UPDATE_CPU);
 #define REGISTER_SCATTER_UPDATE_GPU(type) REGISTER_SCATTER_UPDATE(type, GPU);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_ADD_SUB_GPU);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_SCATTER_UPDATE_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ADD_SUB_GPU);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_UPDATE_GPU);
 #endif  // GOOGLE_CUDA
@@ -253,7 +253,7 @@ namespace functor {
   DECLARE_GPU_SPECS_INDEX(T, int32); \
   DECLARE_GPU_SPECS_INDEX(T, int64);
-TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPECS);
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DECLARE_GPU_SPECS);
 #undef DECLARE_GPU_SPECS
 #undef DECLARE_GPU_SPECS_INDEX
......
@@ -52,7 +52,7 @@ class SummaryScalarOp : public OpKernel {
     for (int i = 0; i < Ttags.size(); i++) {
       Summary::Value* v = s.add_value();
       v->set_tag(Ttags(i));
-      v->set_simple_value(T(Tvalues(i)));
+      v->set_simple_value(float(Tvalues(i)));
     }
     Tensor* summary_tensor = nullptr;
@@ -87,7 +87,7 @@ class SummaryHistoOp : public OpKernel {
     histogram::Histogram histo;
     for (int64 i = 0; i < flat.size(); i++) {
       T v = flat(i);
-      if (!std::isfinite(v)) {
+      if (!Eigen::numext::isfinite(v)) {
         c->SetStatus(
             errors::OutOfRange("Nan in summary histogram for: ", name()));
         break;
......
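The std::isfinite call replaced above has no overload for Eigen::half, while Eigen::numext::isfinite dispatches on the scalar type, so the same templated loop now compiles for float, double, and half alike. A hedged sketch of the generic usage (assuming the same Eigen headers the kernels use; AllFinite is a hypothetical helper, not TensorFlow's):

#include <unsupported/Eigen/CXX11/Tensor>  // Eigen::half and Eigen::numext

template <typename T>
bool AllFinite(const T* data, int n) {
  for (int i = 0; i < n; ++i) {
    // std::isfinite(data[i]) would fail to compile for T = Eigen::half.
    if (!Eigen::numext::isfinite(data[i])) return false;
  }
  return true;
}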
@@ -121,6 +121,8 @@ TENSOR_PROTO_EXTRACT_TYPE(quint8, int, int32);
 #undef TENSOR_PROTO_EXTRACT_TYPE_HELPER
 #undef TENSOR_PROTO_EXTRACT_TYPE
 // Custom implementation for qint32, based on the one for int32.
 template <>
 struct SaveTypeTraits<qint32> : SaveTypeTraits<int32> {};
@@ -137,6 +139,37 @@ inline void Fill(const qint32* data, size_t n, TensorProto* t) {
   t->mutable_int_val()->Swap(&copy);
 }
+// Custom implementation for Eigen::half.
+template <>
+struct SaveTypeTraits<Eigen::half> {
+  static constexpr bool supported = true;
+  typedef int SavedType;
+  typedef protobuf::RepeatedField<int32> RepeatedField;
+};
+template <>
+inline const int* TensorProtoData<Eigen::half>(const TensorProto& t) {
+  return t.half_val().data();
+}
+template <>
+inline protobuf::RepeatedField<int32>* MutableTensorProtoData<Eigen::half>(
+    TensorProto* t) {
+  return t->mutable_half_val();
+}
+template <>
+inline void Fill(const Eigen::half* data, size_t n, TensorProto* t) {
+  typename protobuf::RepeatedField<int32>* val = t->mutable_half_val();
+  val->Resize(n, 0);
+  for (size_t i = 0; i < n; ++i) {
+    val->Set(i, data[i].x);
+  }
+}
 // Custom implementation for string.
 template <>
 struct SaveTypeTraits<string> {
   static constexpr bool supported = true;
......
@@ -70,7 +70,7 @@ class ZeroOutOp : public OpKernel {
     // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
-      output_flat(i) = 0;
+      output_flat(i) = T(0);
     }
     // Preserve the first input value
......
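One detail of the checkpoint changes worth spelling out: the Fill specialization for Eigen::half stores data[i].x, the raw 16-bit payload, widened into the proto's repeated int32 half_val field, so restoring is a bit-pattern copy rather than a numeric conversion. A minimal stand-alone sketch of that round trip (HalfBits is a hypothetical stand-in for Eigen::half's storage; not the TensorFlow code):

#include <cstdint>
#include <vector>

struct HalfBits { uint16_t x; };  // stand-in for Eigen::half's bit storage

std::vector<int32_t> SaveHalf(const std::vector<HalfBits>& data) {
  std::vector<int32_t> half_val(data.size());
  for (size_t i = 0; i < data.size(); ++i)
    half_val[i] = data[i].x;  // widen the bit pattern to int32
  return half_val;
}

std::vector<HalfBits> RestoreHalf(const std::vector<int32_t>& half_val) {
  std::vector<HalfBits> out(half_val.size());
  for (size_t i = 0; i < half_val.size(); ++i)
    out[i].x = static_cast<uint16_t>(half_val[i]);  // truncate back to 16 bits
  return out;
}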