Implement apply gradients for keras optimizer v2.

PiperOrigin-RevId: 219189655

Implement apply gradients for keras optimizer v2.
PiperOrigin-RevId: 219189655
98fb3b92 · Zhenyu Tan · TensorFlower Gardener · d09c6878 · 98fb3b92 · 98fb3b92
7 changed file
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -471,6 +471,28 @@ cuda_py_test(
    ],
 )

+cuda_py_test(
+    name = "keras_optimizer_v2_test",
+    srcs = ["keras_optimizer_v2_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
 cuda_py_test(
    name = "estimator_training_test",
    size = "large",

--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show that DistributionStrategy works with canned Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import training
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+
+
+class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def dataset_input_fn(self, x, y, batch_size):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      dataset = dataset.repeat(1).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          distribution=[
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu,
+              combinations.mirrored_strategy_with_two_gpus
+          ],
+          use_train_and_evaluate=[True, False]))
+  def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    train_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices))
+    eval_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices))
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, batch_size=batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    session_config = config_pb2.ConfigProto(
+        log_device_placement=True, allow_soft_placement=True)
+    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir,
+        dnn_optimizer=adam.Adam(0.001),
+        linear_optimizer=adam.Adam(0.001),
+        config=run_config.RunConfig(
+            train_distribute=distribution,
+            eval_distribute=distribution,
+            session_config=session_config))
+
+    num_steps = 2
+    if use_train_and_evaluate:
+      scores, _ = training.train_and_evaluate(
+          estimator, training.TrainSpec(train_input_fn, max_steps=num_steps),
+          training.EvalSpec(eval_input_fn))
+    else:
+      estimator.train(train_input_fn, steps=num_steps)
+      scores = estimator.evaluate(eval_input_fn)
+
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+class MirroredStrategyOptimizerV2Test(test.TestCase):
+
+  def testKerasOptimizerWithUnequalInput(self):
+    if context.num_gpus() < 1:
+      self.skipTest('Not enough GPUs.')
+
+    def create_fn(device_id):
+      var = variables.Variable(
+          2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
+      # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
+      loss = (device_id + 1) * var
+      optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
+      train_op = optimizer.minimize(loss, var_list=[var])
+      m = optimizer.get_slot(var, 'm')
+      v = optimizer.get_slot(var, 'v')
+      return (var, m, v, train_op, optimizer.iteration)
+
+    devices = ['/device:GPU:0', '/device:CPU:0']
+    dist = mirrored_strategy.MirroredStrategy(devices)
+    with dist.scope():
+      (var, m, v, op, counter) = dist.call_for_each_replica(
+          create_fn, dist.worker_device_index, run_concurrently=False)
+      self.evaluate(variables.global_variables_initializer())
+      var_val = [2.0, 2.0, 2.0]
+      self.assertAllClose(
+          var_val,
+          self.evaluate(
+              [dist.read_var(var),
+               var.get(devices[0]),
+               var.get(devices[1])]))
+      self.assertAllClose([0, 0, 0],
+                          self.evaluate([
+                              dist.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+      train_op = dist.unwrap(op)
+      self.evaluate(train_op)
+      # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
+      m_val = [1.2, 1.2, 1.2]
+      # assert slot variables in both replicas are the same.
+      self.assertAllClose(
+          m_val,
+          self.evaluate(
+              [dist.read_var(m),
+               m.get(devices[0]),
+               m.get(devices[1])]))
+      # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
+      v_val = [1.8, 1.8, 1.8]
+      self.assertAllClose(
+          v_val,
+          self.evaluate(
+              [dist.read_var(v),
+               v.get(devices[0]),
+               v.get(devices[1])]))
+      # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
+      #        = 2.0 - 0.01 * 1.2 * sqrt(0.8) / sqrt(1.8) / 0.8
+      var_val = [1.99, 1.99, 1.99]
+      self.assertAllClose(
+          var_val,
+          self.evaluate(
+              [dist.read_var(var),
+               var.get(devices[0]),
+               var.get(devices[1])]))
+      self.assertAllClose([1, 1, 1],
+                          self.evaluate([
+                              dist.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+      self.evaluate(train_op)
+      # m(2) = beta1 * m(1) + (1-beta1) * grad = 0.2 * 1.2 + 0.8 * 1.5
+      m_val = [1.44, 1.44, 1.44]
+      self.assertAllClose(
+          m_val,
+          self.evaluate(
+              [dist.read_var(m),
+               m.get(devices[0]),
+               m.get(devices[1])]))
+      # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
+      v_val = [2.16, 2.16, 2.16]
+      self.assertAllClose(
+          v_val,
+          self.evaluate(
+              [dist.read_var(v),
+               v.get(devices[0]),
+               v.get(devices[1])]))
+      self.assertAllClose([2, 2, 2],
+                          self.evaluate([
+                              dist.read_var(counter),
+                              counter.get(devices[0]),
+                              counter.get(devices[1])
+                          ]))
+
+
+if __name__ == '__main__':
+  test.main()
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -12,6 +12,7 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 py_library(
    name = "optimizer_v2",
    srcs = [
+        "adam.py",
        "gradient_descent.py",
        "optimizer_v2.py",
    ],

--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Adam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Adam(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adam algorithm.
+
+  Adam optimization is a stochastic gradient descent method that is based on
+  adaptive estimation of first-order and second-order moments. According to the
+  reference, the method is 'computationally efficient, has little memory
+  requirement, invariant to diagonal rescaling of gradients, and is well suited
+  for problems that are large in terms of data/parameters'.
+
+  # References
+      See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+        ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-8,
+               name='Adam'):
+    r"""Construct a new Adam optimizer.
+
+    Initialization:
+
+    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+    $$t := 0 \text{(Initialize timestep)}$$
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    $$t := t + 1$$
+    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the 2nd moment estimates.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".  @compatibility(eager) When eager execution is
+        enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
+        a callable that takes no arguments and returns the actual value to use.
+        This can be useful for changing these values across different
+        invocations of optimizer functions. @end_compatibility
+    """
+
+    super(Adam, self).__init__(name)
+    self._lr = learning_rate
+    self._beta_1 = beta_1
+    self._beta_2 = beta_2
+    self._epsilon = epsilon
+
+  def _create_slots(self, var_list):
+    # Create slots for the first and second moments.
+    for var in var_list:
+      self.add_slot(var, 'm')
+      self.add_slot(var, 'v')
+
+  def _resource_apply_dense(self, grad, var):
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    # TODO(tanzheny): let optimizer have its own step counter, and let
+    # beta1_power and beta2_power depend on it.
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        math_ops.cast(self._beta_1, grad.dtype.base_dtype),
+        math_ops.cast(self._beta_2, grad.dtype.base_dtype),
+        math_ops.cast(self._lr, grad.dtype.base_dtype),
+        math_ops.cast(self._beta_1, grad.dtype.base_dtype),
+        math_ops.cast(self._beta_2, grad.dtype.base_dtype),
+        math_ops.cast(self._epsilon, grad.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -64,13 +64,13 @@ class GradientDescentOptimizerTest(test.TestCase):
        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        sgd_op = gradient_descent.SGD(3.0).apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
+        sgd = gradient_descent.SGD(3.0)
+        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
        # TODO(apassos) calling initialize_resources on all resources here
        # doesn't work because the sessions and graph are reused across unit
        # tests and this would mean trying to reinitialize variables. Figure out
        # a long-term solution for this.
-        resources.initialize_resources([var0, var1]).run()
+        resources.initialize_resources([var0, var1, sgd.iteration]).run()
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
@@ -90,13 +90,13 @@ class GradientDescentOptimizerTest(test.TestCase):
        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
        lr = lambda: 3.0
-        sgd_op = gradient_descent.SGD(lr).apply_gradients(
-            zip([grads0, grads1], [var0, var1]))
+        sgd = gradient_descent.SGD(lr)
+        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
        # TODO(apassos) calling initialize_resources on all resources here
        # doesn't work because the sessions and graph are reused across unit
        # tests and this would mean trying to reinitialize variables. Figure out
        # a long-term solution for this.
-        resources.initialize_resources([var0, var1]).run()
+        resources.initialize_resources([var0, var1, sgd.iteration]).run()
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
@@ -116,12 +116,13 @@ class GradientDescentOptimizerTest(test.TestCase):
        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
        pred = math_ops.matmul(var0, x) + var1
        loss = pred * pred
-        sgd_op = gradient_descent.SGD(1.0).minimize(loss)
+        sgd = gradient_descent.SGD(1.0)
+        sgd_op = sgd.minimize(loss, [var0, var1])
        # TODO(apassos) calling initialize_resources on all resources here
        # doesn't work because the sessions and graph are reused across unit
        # tests and this would mean trying to reinitialize variables. Figure out
        # a long-term solution for this.
-        resources.initialize_resources([var0, var1]).run()
+        resources.initialize_resources([var0, var1, sgd.iteration]).run()
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
        self.assertAllCloseAccordingToType([3.0], var1.eval())
@@ -143,7 +144,7 @@ class GradientDescentOptimizerTest(test.TestCase):
        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
        pred += var1
        loss = pred * pred
-        sgd_op = gradient_descent.SGD(1.0).minimize(loss)
+        sgd_op = gradient_descent.SGD(1.0).minimize(loss, [var0, var1])
        variables.global_variables_initializer().run()
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
@@ -193,25 +194,24 @@ class GradientDescentOptimizerTest(test.TestCase):
  def testWithGlobalStep(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.cached_session():
-        global_step = variables.Variable(0, trainable=False)
        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        sgd_op = gradient_descent.SGD(3.0).apply_gradients(
-            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        sgd = gradient_descent.SGD(3.0)
+        sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
        variables.global_variables_initializer().run()
        # Fetch params to validate initial values
        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
        # Run 1 step of sgd
        sgd_op.run()
-        # Validate updated params and global_step
+        # Validate updated params and optimizer iterations.
        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
                                           var0.eval())
        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
+        self.assertAllCloseAccordingToType(1, sgd.iteration.eval())

  def testSparseBasic(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:

--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -22,7 +22,20 @@ from __future__ import print_function

 import abc

+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.util import nest


 class OptimizerV2(optimizer_v1.Optimizer):
@@ -77,29 +90,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
  opt.apply_gradients(capped_grads_and_vars)
  ```

-  ### Gating Gradients
-
-  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
-  argument that controls the degree of parallelism during the application of
-  the gradients.
-
-  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
-
-  <b>`GATE_NONE`</b>: Compute and apply gradients in parallel.  This provides
-  the maximum parallelism in execution, at the cost of some non-reproducibility
-  in the results.  For example the two gradients of `matmul` depend on the input
-  values: With `GATE_NONE` one of the gradients could be applied to one of the
-  inputs _before_ the other gradient is computed resulting in non-reproducible
-  results.
-
-  <b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
-  they are used.  This prevents race conditions for Ops that generate gradients
-  for multiple inputs where the gradients depend on the inputs.
-
-  <b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
-  before any one of them is used.  This provides the least parallelism but can
-  be useful if you want to process all gradients before applying any of them.
-
  ### Slots

  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
@@ -111,11 +101,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
  This can be useful if you want to log debug a training algorithm, report stats
  about the slots, etc.

-  ### Non-slot variables
-
-  Some optimizer subclasses, such as `AdamOptimizer` have variables that
-  are not associated with the variables to train, just the step itself.
-
  ### Hyper parameters

  These are arguments passed to the optimizer subclass constructor
@@ -124,18 +109,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
  callables. If they are callable, the callable will be called during
  `apply_gradients()` to get the value for the hyper parameter.

-  ### State
-
-  Internal methods are passed a `state` argument with the correct
-  values to use for the slot and non-slot variables, and the hyper
-  parameters.
  """

-  # Values for gate_gradients.
-  GATE_NONE = 0
-  GATE_OP = 1
-  GATE_GRAPH = 2
-
  def __init__(self, name):
    """Create a new Optimizer.

@@ -145,6 +120,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
    you should be able to use the _set_hyper()/state.get_hyper()
    facility instead.

+    This class in stateful and thread-compatible.
+
    Args:
      name: A non-empty string.  The name to use for accumulators created
        for the optimizer.
@@ -157,6 +134,192 @@ class OptimizerV2(optimizer_v1.Optimizer):
    self._use_locking = True
    super(OptimizerV2, self).__init__(self._use_locking, name)
    self._hyper = {}
+    self._slots = {}
+    self._prepared = False
+
+  def minimize(self,
+               loss,
+               var_list,
+               aggregation_method=None,
+               colocate_gradients_with_ops=False,
+               name=None,
+               grad_loss=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      var_list: list or tuple of `Variable` objects to update to minimize
+        `loss`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with the
+        corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes no arguments and computes the value to be minimized. Minimization (and
+    gradient computation) is done with respect to the elements of `var_list` if
+    not None, else with respect to any trainable variables created during the
+    execution of the `loss` function. `gate_gradients`, `aggregation_method`,
+    `colocate_gradients_with_ops` and `grad_loss` are ignored when eager
+    execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss,
+        var_list=var_list,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss)
+
+    return self.apply_gradients(grads_and_vars, name=name)
+
+  def compute_gradients(self,
+                        loss,
+                        var_list,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None,
+                        stop_gradients=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking no
+        arguments which returns the value to minimize. When eager execution is
+        enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph under
+        the key `GraphKeys.TRAINABLE_VARIABLES`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with the
+        corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid, or var_list is None.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `aggregation_method`, and
+    `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    var_list = nest.flatten(var_list)
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        tape.watch(var_list)
+        loss_value = loss()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+    else:
+      if context.executing_eagerly():
+        raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
+                           "should be a function when eager execution is "
+                           "enabled.")
+      self._assert_valid_dtypes([loss])
+      if grad_loss is not None:
+        self._assert_valid_dtypes([grad_loss])
+      grads = gradients.gradients(
+          loss,
+          var_list,
+          grad_ys=grad_loss,
+          aggregation_method=aggregation_method,
+          colocate_gradients_with_ops=colocate_gradients_with_ops,
+          stop_gradients=stop_gradients)
+
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes([
+        v for g, v in grads_and_vars
+        if g is not None and v.dtype != dtypes.resource
+    ])
+
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      name: Optional name for the returned operation.  Default to the name
+        passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    grads_and_vars = _filter_grads(grads_and_vars)
+    var_list = [v for (_, v) in grads_and_vars]
+    if distribution_strategy_context.has_distribution_strategy():
+      reduced_grads = merge_grads(grads_and_vars)
+      grads_and_vars = zip(reduced_grads, var_list)
+
+    with ops.init_scope():
+      self._create_slots(var_list)
+    update_ops = []
+
+    def update_grad_to_var(grad, var):
+      """Apply gradient to variable."""
+      if isinstance(var, ops.Tensor):
+        raise NotImplementedError("Trying to update a Tensor ", var)
+      if isinstance(grad, ops.IndexedSlices):
+        if var.constraint is not None:
+          raise RuntimeError(
+              "Cannot use a constraint function on a sparse variable.")
+        return self._resource_apply_sparse_duplicate_indices(
+            grad.values, var, grad.indices)
+      update_op = self._resource_apply_dense(grad, var)
+      if var.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return var.assign(var.constraint(var))
+      else:
+        return update_op
+
+    with ops.name_scope(name, self._name) as name:
+      self._prepare()
+      for grad, var in grads_and_vars:
+        scope_name = "" if in_eager_execution() else "_" + var.op.name
+        with ops.name_scope("update" + scope_name), ops.colocate_with(var):
+          update_ops.append(update_grad_to_var(grad, var))
+      with ops.colocate_with(self._iterations):
+        update_ops.append(self._iterations.assign_add(1))
+      return control_flow_ops.group(*update_ops)

  def _set_hyper(self, name, value):
    self._hyper[name] = value
@@ -166,8 +329,33 @@ class OptimizerV2(optimizer_v1.Optimizer):
    value = self._hyper[name]
    return self._call_if_callable(value)

+  def add_slot(self, var, slot_name):
+    slot_key = _get_slot_key_from_var(var, slot_name)
+    if slot_key not in self._slots:
+      self._slots[slot_key] = self.add_weight(
+          name=slot_key, shape=var.shape, dtype=var.dtype)
+
+  def get_slot(self, var, slot_name):
+    slot_key = _get_slot_key_from_var(var, slot_name)
+    return self._slots[slot_key]
+
  def _prepare(self):
-    pass
+    if self._prepared:
+      return
+    # This is where all hyper variables will be created.
+    with ops.device("cpu:0"):
+      self._iterations = self.add_weight(
+          self._name + "/iter",
+          shape=[],
+          trainable=False,
+          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+    self._prepared = True
+
+  @property
+  def iteration(self):
+    if not self._prepared:
+      self._prepare()
+    return self._iterations

  @abc.abstractmethod
  def get_config(self):
@@ -205,3 +393,116 @@ class OptimizerV2(optimizer_v1.Optimizer):
  def _serialize_hyperparameter(self, hyperparameter_name):
    """Serialize a hyperparameter that can be a float, callable, or Tensor."""
    return self._hyper[hyperparameter_name]
+
+  def add_weight(self,
+                 name,
+                 shape,
+                 dtype=None,
+                 initializer="zeros",
+                 trainable=None,
+                 synchronization=variables.VariableSynchronization.AUTO,
+                 aggregation=variables.VariableAggregation.NONE):
+
+    if dtype is None:
+      dtype = dtypes.float32
+    initializer = initializers.get(initializer)
+
+    if synchronization == variables.VariableSynchronization.ON_READ:
+      if trainable:
+        raise ValueError(
+            "Synchronization value can be set to "
+            "VariableSynchronization.ON_READ only for non-trainable variables. "
+            "You have specified trainable=True and "
+            "synchronization=VariableSynchronization.ON_READ.")
+      else:
+        # Set trainable to be false when variable is to be synced on read.
+        trainable = False
+    elif trainable is None:
+      trainable = True
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        getter=base_layer.make_variable,
+        overwrite=True,
+        initializer=initializers.get(initializer),
+        dtype=dtype,
+        trainable=trainable,
+        use_resource=True,
+        synchronization=synchronization,
+        aggregation=aggregation)
+
+    return variable
+
+
+def _filter_grads(grads_and_vars):
+  """Filter out iterable with grad equal to None."""
+  grads_and_vars = tuple(grads_and_vars)
+  if not grads_and_vars:
+    raise ValueError("No variables provided.")
+  filtered = []
+  vars_with_empty_grads = []
+  for grad, var in grads_and_vars:
+    if grad is None:
+      vars_with_empty_grads.append(var)
+    else:
+      filtered.append((grad, var))
+  filtered = tuple(filtered)
+  if not filtered:
+    raise ValueError("No gradients provided for any variable: %s." %
+                     ([v.name for _, v in filtered],))
+  if vars_with_empty_grads:
+    logging.warning(
+        ("Gradients does not exist for variables %s when minimizing the loss."),
+        ([v.name for v in vars_with_empty_grads]))
+  return filtered
+
+
+def merge_grads(grads_and_vars):
+  """Merge gradients from different replicas."""
+
+  def merge_grad_fn(strategy, grads_and_vars):
+    reduced_grads = strategy.batch_reduce(
+        variable_scope.VariableAggregation.MEAN, grads_and_vars)
+    return reduced_grads
+
+  return distribution_strategy_context.get_tower_context().merge_call(
+      merge_grad_fn, grads_and_vars)
+
+
+def in_eager_execution():
+  with ops.init_scope():
+    return context.executing_eagerly()
+
+
+def _get_slot_key_from_var(var, slot_name):
+  """Get the slot key for the variable.
+
+  Scope the slot name in the namespace of the primary variable.
+  Set "primary.op.name + '/' + slot_name" as default name.
+
+  In graph mode the name is derived from the op.
+  In eager mode the name is derived from the var.
+  If distribution strategy exists, then the name is derived from the primary
+  variable instead of replica variable, i.e., /dense/kernel instead of
+  /dense/kernel/replica_1. If the slot name is 'm', then the slot variables
+  being created are /dense/kernel/m and /dense/kernel/m/replica_1, instead of
+  /dense/kernel/replica_1/m/replica_1.
+
+  Args:
+    var: the variable.
+    slot_name: the name of the slot.
+
+  Returns:
+    the name of the variable.
+  """
+
+  # pylint: disable=protected-access
+  if distribution_strategy_context.has_distribution_strategy() and hasattr(
+      var, "_primary_var"):
+    var = var._primary_var
+  if context.executing_eagerly():
+    name = var._shared_name
+  else:
+    name = var.op.name
+  return name + "/" + slot_name
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -36,194 +37,179 @@ class OptimizerTest(test.TestCase):

  @test_util.run_in_graph_and_eager_modes
  def testBasic(self):
-    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-      def loss():
-        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-      # Note that for eager execution, minimize expects a function instead of a
-      # Tensor.
-      global_step = resource_variable_ops.ResourceVariable(
-          array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
-      sgd_op = gradient_descent.SGD(3.0)
-
-      self.evaluate(variables.global_variables_initializer())
-      # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
-      # Run 1 step of sgd through optimizer
-      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
-      self.evaluate(opt_op)
-      # Validate updated params
-      self.assertAllClose([-14., -13.], self.evaluate(var0))
-      self.assertAllClose([-6., -5.], self.evaluate(var1))
+    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd = gradient_descent.SGD(3.0)
+
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Run 1 step of sgd through optimizer
+        opt_op = sgd.minimize(loss, var_list=[var0, var1])
+        self.evaluate(sgd.iteration.initializer)
+        self.evaluate(opt_op)
+        # Validate updated params
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))

+  @test_util.run_in_graph_and_eager_modes
  def testAggregationMethod(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.cached_session():
        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        cost = 5 * var0 + 3 * var1
-        global_step = variables.Variable(
-            array_ops.zeros([], dtypes.int64), name='global_step')
-        sgd_op = gradient_descent.SGD(3.0)
-        opt_op = sgd_op.minimize(
-            cost,
-            global_step, [var0, var1],
-            aggregation_method=gradients_impl.AggregationMethod.
-            EXPERIMENTAL_ACCUMULATE_N)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd = gradient_descent.SGD(3.0)

-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
        # Run 1 step of sgd through optimizer
-        opt_op.run()
+        opt_op = sgd.minimize(
+            loss,
+            var_list=[var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod
+            .EXPERIMENTAL_ACCUMULATE_N)
+        self.evaluate(sgd.iteration.initializer)
+        self.evaluate(opt_op)
        # Validate updated params
-        self.assertAllClose([-14., -13.], var0.eval())
-        self.assertAllClose([-6., -5.], var1.eval())
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))

+  @test_util.run_in_graph_and_eager_modes
  def testPrecomputedGradient(self):
    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
      with self.cached_session():
        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        cost = 5 * var0 + 3 * var1
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
        grad_loss = constant_op.constant([42, -42], dtype=dtype)
-        global_step = variables.Variable(
-            array_ops.zeros([], dtypes.int64), name='global_step')
-        sgd_op = gradient_descent.SGD(3.0)
-        opt_op = sgd_op.minimize(
-            cost, global_step, [var0, var1], grad_loss=grad_loss)
+        sgd = gradient_descent.SGD(3.0)

-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
        # Run 1 step of sgd through optimizer
-        opt_op.run()
+        opt_op = sgd.minimize(loss, var_list=[var0, var1], grad_loss=grad_loss)
+        self.evaluate(sgd.iteration.initializer)
+        self.evaluate(opt_op)
        # Validate updated params
        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
-                            var0.eval())
+                            self.evaluate(var0))
        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
-                            var1.eval())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNoVariables(self):
-    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      # pylint: disable=cell-var-from-loop
-      def loss():
-        var0 = resource_variable_ops.ResourceVariable(
-            [1.0, 2.0], dtype=dtype, trainable=False, name='a')
-        var1 = resource_variable_ops.ResourceVariable(
-            [3.0, 4.0], dtype=dtype, trainable=False, name='b')
-        return 5 * var0 + var1
-      # pylint: enable=cell-var-from-loop
-      sgd_op = gradient_descent.SGD(3.0)
-      with self.assertRaisesRegexp(ValueError, 'No.*variables'):
-        sgd_op.minimize(loss)
+                            self.evaluate(var1))

  @test_util.run_in_graph_and_eager_modes
  def testNoGradients(self):
    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-      # pylint: disable=cell-var-from-loop
-      def loss():
-        return 5 * var0
-      # pylint: enable=cell-var-from-loop
-      sgd_op = gradient_descent.SGD(3.0)
-      with self.assertRaisesRegexp(ValueError, 'No gradients'):
-        # var1 has no gradient
-        sgd_op.minimize(loss, var_list=[var1])
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError, 'No gradients'):
+          # var1 has no gradient
+          sgd_op.minimize(loss, var_list=[var1])

  @test_util.run_in_graph_and_eager_modes
  def testNoGradientsForAnyVariables_Minimize(self):
    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-      def loss():
-        return constant_op.constant(5.0)
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: constant_op.constant(5.0)
+        if not context.executing_eagerly():
+          loss = loss()

-      sgd_op = gradient_descent.SGD(3.0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'No gradients provided for any variable'):
-        sgd_op.minimize(loss, var_list=[var0, var1])
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError,
+                                     'No gradients provided for any variable'):
+          sgd_op.minimize(loss, var_list=[var0, var1])

  @test_util.run_in_graph_and_eager_modes
  def testNoGradientsForAnyVariables_ApplyGradients(self):
    for _, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-      sgd_op = gradient_descent.SGD(3.0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'No gradients provided for any variable'):
-        sgd_op.apply_gradients([(None, var0), (None, var1)])
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        sgd_op = gradient_descent.SGD(3.0)
+        with self.assertRaisesRegexp(ValueError,
+                                     'No gradients provided for any variable'):
+          sgd_op.apply_gradients([(None, var0), (None, var1)])

  @test_util.run_in_graph_and_eager_modes
  def testGradientsAsVariables(self):
    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-      def loss():
-        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
-
-      sgd_op = gradient_descent.SGD(3.0)
-      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
-      # Convert gradients to tf.Variables
-      converted_grads = [
-          resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
-                                                 name='c_%d_%d' % (i, j))
-          for j, gv in enumerate(grads_and_vars)
-      ]
-      convert_ops = [
-          state_ops.assign(converted_grads[j], gv[0])
-          for j, gv in enumerate(grads_and_vars)
-      ]
-
-      self.evaluate(variables.global_variables_initializer())
-      # Run convert_ops to achieve the gradietns converting
-      self.evaluate(convert_ops)
-      # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
-      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        loss = lambda: 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+        if not context.executing_eagerly():
+          loss = loss()
+
+        sgd = gradient_descent.SGD(3.0)
+        grads_and_vars = sgd.compute_gradients(loss, [var0, var1])
+        # Convert gradients to tf.Variables
+        converted_grads = [
+            resource_variable_ops.ResourceVariable(
+                array_ops.zeros([2], dtype), name='c_%d_%d' % (i, j))
+            for j, gv in enumerate(grads_and_vars)
+        ]
+        convert_ops = [
+            state_ops.assign(converted_grads[j], gv[0])
+            for j, gv in enumerate(grads_and_vars)
+        ]
+
+        # Run convert_ops to achieve the gradients converting
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(convert_ops)
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))

-      # Run 1 step of sgd through optimizer
-      converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
-      opt_op = sgd_op.apply_gradients(converted_grads_and_vars)
-      self.evaluate(opt_op)
+        # Run 1 step of sgd through optimizer
+        converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+        opt_op = sgd.apply_gradients(converted_grads_and_vars)
+        self.evaluate(sgd.iteration.initializer)
+        self.evaluate(opt_op)

-      # Validate updated params
-      self.assertAllClose([-14., -13.], self.evaluate(var0))
-      self.assertAllClose([-6., -5.], self.evaluate(var1))
+        # Validate updated params
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))

  @test_util.run_in_graph_and_eager_modes
  def testComputeGradientsWithTensors(self):
-    x = ops.convert_to_tensor(1.0)
-    def f():
-      return x * x
+    with self.cached_session():
+      x = ops.convert_to_tensor(1.0)

-    sgd_op = gradient_descent.SGD(3.0)
-    grads_and_vars = sgd_op.compute_gradients(f, [x])
-    self.assertEqual(1, len(grads_and_vars))
-    grad, x_as_var = grads_and_vars[0]
-    self.assertIs(x, x_as_var)
-    self.assertEqual(2.0, self.evaluate(grad))
+      def f():
+        return x * x

-    with self.assertRaises(NotImplementedError):
-      sgd_op.apply_gradients(grads_and_vars)
+      sgd = gradient_descent.SGD(3.0)
+      grads_and_vars = sgd.compute_gradients(f, [x])
+      self.assertEqual(1, len(grads_and_vars))
+      grad, x_as_var = grads_and_vars[0]
+      self.assertIs(x, x_as_var)
+      self.assertEqual(2.0, self.evaluate(grad))

-  def testTrainOp(self):
-    with self.cached_session():
-      var0 = variables.Variable([1.0, 2.0])
-      var1 = variables.Variable([3.0, 4.0])
-      cost = 5 * var0 + 3 * var1
-      global_step = variables.Variable(
-          array_ops.zeros([], dtypes.int64), name='global_step')
-      sgd_op = gradient_descent.SGD(3.0)
-      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
-      self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
+      with self.assertRaises(NotImplementedError):
+        sgd.apply_gradients(grads_and_vars)

+  @test_util.run_in_graph_and_eager_modes
  def testConstraint(self):
    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
@@ -232,21 +218,29 @@ class OptimizerTest(test.TestCase):
                                constraint=constraint_01)
      var1 = variables.Variable([3.0, 4.0],
                                constraint=constraint_0)
-      cost = 5 * var0 + 3 * var1
-      global_step = variables.Variable(
-          array_ops.zeros([], dtypes.int64), name='global_step')
-      sgd_op = gradient_descent.SGD(3.0)
-      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      loss = lambda: 5 * var0 + 3 * var1
+      if not context.executing_eagerly():  # pylint: disable=cell-var-from-loop
+        loss = loss()
+      sgd = gradient_descent.SGD(3.0)

-      variables.global_variables_initializer().run()
+      self.evaluate(variables.global_variables_initializer())
      # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([3.0, 4.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
      # Run 1 step of sgd through optimizer
-      opt_op.run()
+      opt_op = sgd.minimize(loss, var_list=[var0, var1])
+      self.evaluate(sgd.iteration.initializer)
+      self.evaluate(opt_op)
      # Validate updated params
-      self.assertAllClose([-0.1, -0.1], var0.eval())
-      self.assertAllClose([0., 0.], var1.eval())
+      self.assertAllClose([-0.1, -0.1], self.evaluate(var0))
+      self.assertAllClose([0., 0.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testIterationWithoutMinimize(self):
+    with self.cached_session():
+      sgd = gradient_descent.SGD(3.0)
+      self.evaluate(sgd.iteration.initializer)
+      self.assertEqual(0, self.evaluate(sgd.iteration))


 if __name__ == '__main__':