Commit c5ff4ec7 authored by Taylor Robie

rough pass at carving out existing NCF pipeline

2nd half of rough replacement pass

fix dataset map functions

reduce bias in sample selection

cache pandas work on a daily basis

cleanup and fix batch check for multi gpu

multi device fix

fix treatment of eval data padding

print data producer

replace epoch overlap with padding and masking

move type and shape info into the producer class and update run.sh with larger batch size hyperparams

remove xla for multi GPU

more cleanup

remove model runner altogether

bug fixes

address subtle pipeline hang and improve producer __repr__

fix crash

fix assert

use popen_helper to create pools

add StreamingFilesDataset and abstract data storage to a separate class

bug fix

fix wait bug and add manual stack trace print

more bug fixes and refactor valid point mask to work with TPU sharding

misc bug fixes and adjust dtypes

address crash from decoding bools

fix remaining dtypes and change record writer pattern since it does not append

fix synthetic data

use TPUStrategy instead of TPUEstimator

minor tweaks around moving to TPUStrategy

cleanup some old code

delint and simplify permutation generation

remove low level tf layer definition, use single table with slice for keras, and misc fixes

missed minor point on removing tf layer definition

fix several bugs from recombining layer definitions

delint and add docstrings

Update ncf_test.py. Section for identical inputs and different outputs was removed.

update data test to run against the new producer class
Parent ae86bfd9
......@@ -14,36 +14,30 @@
# ==============================================================================
"""Central location for NCF specific values."""
import os
import time
import sys
import numpy as np
from official.datasets import movielens
# ==============================================================================
# == Main Thread Data Processing ===============================================
# ==============================================================================
class Paths(object):
"""Container for various path information used while training NCF."""
def __init__(self, data_dir, cache_id=None):
self.cache_id = cache_id or int(time.time())
self.data_dir = data_dir
self.cache_root = os.path.join(
self.data_dir, "{}_ncf_recommendation_cache".format(self.cache_id))
self.train_shard_subdir = os.path.join(self.cache_root,
"raw_training_shards")
self.train_shard_template = os.path.join(self.train_shard_subdir,
"positive_shard_{}.pickle")
self.train_epoch_dir = os.path.join(self.cache_root, "training_epochs")
self.eval_data_subdir = os.path.join(self.cache_root, "eval_data")
self.subproc_alive = os.path.join(self.cache_root, "subproc.alive")
# Keys for data shards
TRAIN_USER_KEY = "train_{}".format(movielens.USER_COLUMN)
TRAIN_ITEM_KEY = "train_{}".format(movielens.ITEM_COLUMN)
TRAIN_LABEL_KEY = "train_labels"
MASK_START_INDEX = "mask_start_index"
VALID_POINT_MASK = "valid_point_mask"
EVAL_USER_KEY = "eval_{}".format(movielens.USER_COLUMN)
EVAL_ITEM_KEY = "eval_{}".format(movielens.ITEM_COLUMN)
USER_MAP = "user_map"
ITEM_MAP = "item_map"
APPROX_PTS_PER_TRAIN_SHARD = 128000
# Keys for data shards
TRAIN_KEY = "train"
EVAL_KEY = "eval"
USER_DTYPE = np.int32
ITEM_DTYPE = np.int32
# In both datasets, each user has at least 20 ratings.
MIN_NUM_RATINGS = 20
......@@ -62,21 +56,24 @@ DUPLICATE_MASK = "duplicate_mask"
HR_METRIC_NAME = "HR_METRIC"
NDCG_METRIC_NAME = "NDCG_METRIC"
# Trying to load a cache created in py2 when running in py3 will cause an
# error due to differences in unicode handling.
RAW_CACHE_FILE = "raw_data_cache_py{}.pickle".format(sys.version_info[0])
CACHE_INVALIDATION_SEC = 3600 * 24
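# For example, under Python 3 the cache file name resolves to
# "raw_data_cache_py3.pickle". Presumably (per the "cache pandas work on a
# daily basis" commit above), a cache older than CACHE_INVALIDATION_SEC
# (24 hours) is treated as stale and the pandas preprocessing is redone.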
# ==============================================================================
# == Subprocess Data Generation ================================================
# == Data Generation ===========================================================
# ==============================================================================
CYCLES_TO_BUFFER = 3 # The number of train cycles worth of data to "run ahead"
# of the main training loop.
FLAGFILE_TEMP = "flagfile.temp"
FLAGFILE = "flagfile"
READY_FILE_TEMP = "ready.json.temp"
READY_FILE = "ready.json"
TRAIN_RECORD_TEMPLATE = "train_{}.tfrecords"
EVAL_RECORD_TEMPLATE = "eval_{}.tfrecords"
# Number of batches to run per epoch when using synthetic data. At high batch
# sizes, we run for more batches than with real data, which is good since
# running more batches reduces noise when measuring the average batches/second.
SYNTHETIC_BATCHES_PER_EPOCH = 2000
TIMEOUT_SECONDS = 3600 * 2 # If the train loop goes more than two hours without
# consuming an epoch of data, this is a good
# indicator that the main thread is dead and the
# subprocess is orphaned.
# Only used when StreamingFilesDataset is used.
NUM_FILE_SHARDS = 16
TRAIN_FOLDER_TEMPLATE = "training_cycle_{}"
EVAL_FOLDER = "eval_data"
SHARD_TEMPLATE = "shard_{}.tfrecords"
This diff has been collapsed.
......@@ -18,17 +18,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import os
import pickle
import time
import numpy as np
import pandas as pd
import scipy.stats
import tensorflow as tf
from official.datasets import movielens
from official.recommendation import constants as rconst
from official.recommendation import data_async_generation
from official.recommendation import data_preprocessing
from official.recommendation import stat_utils
......@@ -65,10 +66,10 @@ class BaseTest(tf.test.TestCase):
scores = np.random.randint(low=0, high=5, size=NUM_PTS)
times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS)
rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE)
self.seen_pairs = set()
self.holdout = {}
with tf.gfile.Open(rating_file, "w") as f:
with tf.gfile.Open(self.rating_file, "w") as f:
f.write("user_id,item_id,rating,timestamp\n")
for usr, itm, scr, ts in zip(users, items, scores, times):
pair = (usr, itm)
......@@ -85,21 +86,32 @@ class BaseTest(tf.test.TestCase):
data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS,
NUM_ITEMS)
def make_params(self, train_epochs=1):
return {
"train_epochs": train_epochs,
"batches_per_step": 1,
"use_seed": False,
"batch_size": BATCH_SIZE,
"eval_batch_size": EVAL_BATCH_SIZE,
"num_neg": NUM_NEG,
"match_mlperf": True,
"use_tpu": False,
"use_xla_for_gpu": False,
}
def test_preprocessing(self):
# For the most part the necessary checks are performed within
# construct_cache()
ncf_dataset = data_preprocessing.construct_cache(
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
match_mlperf=False, deterministic=False)
assert ncf_dataset.num_users == NUM_USERS
assert ncf_dataset.num_items == NUM_ITEMS
time.sleep(1) # Ensure we create the next cache in a new directory.
ncf_dataset = data_preprocessing.construct_cache(
dataset=DATASET, data_dir=self.temp_data_dir, num_data_readers=2,
match_mlperf=True, deterministic=False)
assert ncf_dataset.num_users == NUM_USERS
assert ncf_dataset.num_items == NUM_ITEMS
# _filter_index_sort()
for match_mlperf in [True, False]:
cache_path = os.path.join(self.temp_data_dir, "test_cache.pickle")
data, valid_cache = data_preprocessing._filter_index_sort(
self.rating_file, cache_path=cache_path,
match_mlperf=match_mlperf)
assert len(data[rconst.USER_MAP]) == NUM_USERS
assert len(data[rconst.ITEM_MAP]) == NUM_ITEMS
assert not valid_cache
def drain_dataset(self, dataset, g):
# type: (tf.data.Dataset, tf.Graph) -> list
......@@ -115,28 +127,40 @@ class BaseTest(tf.test.TestCase):
return output
def test_end_to_end(self):
ncf_dataset, _ = data_preprocessing.instantiate_pipeline(
dataset=DATASET, data_dir=self.temp_data_dir,
batch_size=BATCH_SIZE, eval_batch_size=EVAL_BATCH_SIZE,
num_cycles=1, num_data_readers=2, num_neg=NUM_NEG)
params = self.make_params(train_epochs=1)
_, _, producer = data_preprocessing.instantiate_pipeline(
dataset=DATASET, data_dir=self.temp_data_dir, deterministic=False,
params=params)
producer.start()
producer.join()
assert producer._fatal_exception is None
user_inv_map = {v: k for k, v in producer.user_map.items()}
item_inv_map = {v: k for k, v in producer.item_map.items()}
# ==========================================================================
# == Training Data =========================================================
# ==========================================================================
g = tf.Graph()
with g.as_default():
input_fn, record_dir, batch_count = \
data_preprocessing.make_input_fn(ncf_dataset, True)
dataset = input_fn({"batch_size": BATCH_SIZE, "use_tpu": False,
"use_xla_for_gpu": False})
input_fn = producer.make_input_fn(is_training=True)
dataset = input_fn(params)
first_epoch = self.drain_dataset(dataset=dataset, g=g)
user_inv_map = {v: k for k, v in ncf_dataset.user_map.items()}
item_inv_map = {v: k for k, v in ncf_dataset.item_map.items()}
counts = defaultdict(int)
train_examples = {
True: set(),
False: set(),
}
for features, labels in first_epoch:
for u, i, l in zip(features[movielens.USER_COLUMN],
features[movielens.ITEM_COLUMN], labels):
for u, i, v, l in zip(
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels):
if not v:
continue # ignore padding
u_raw = user_inv_map[u]
i_raw = item_inv_map[i]
......@@ -145,61 +169,139 @@ class BaseTest(tf.test.TestCase):
# generation, so it will occasionally appear as a negative example
# during training.
assert not l
assert i_raw == self.holdout[u_raw][1]
self.assertEqual(i_raw, self.holdout[u_raw][1])
train_examples[l].add((u_raw, i_raw))
counts[(u_raw, i_raw)] += 1
num_positives_seen = len(train_examples[True])
assert ncf_dataset.num_train_positives == num_positives_seen
self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)
# This check is more heuristic because negatives are sampled with
# replacement. It only checks that negative generation is reasonably random.
assert len(train_examples[False]) / NUM_NEG / num_positives_seen > 0.9
def test_shard_randomness(self):
users = [0, 0, 0, 0, 1, 1, 1, 1]
items = [0, 2, 4, 6, 0, 2, 4, 6]
times = [1, 2, 3, 4, 1, 2, 3, 4]
df = pd.DataFrame({movielens.USER_COLUMN: users,
movielens.ITEM_COLUMN: items,
movielens.TIMESTAMP_COLUMN: times})
cache_paths = rconst.Paths(data_dir=self.temp_data_dir)
np.random.seed(1)
num_shards = 2
num_items = 10
data_preprocessing.generate_train_eval_data(
df, approx_num_shards=num_shards, num_items=num_items,
cache_paths=cache_paths, match_mlperf=True)
raw_shards = tf.gfile.ListDirectory(cache_paths.train_shard_subdir)
assert len(raw_shards) == num_shards
sharded_eval_data = []
for i in range(2):
sharded_eval_data.append(data_async_generation._process_shard(
(os.path.join(cache_paths.train_shard_subdir, raw_shards[i]),
num_items, rconst.NUM_EVAL_NEGATIVES, stat_utils.random_int32(),
False, True)))
if sharded_eval_data[0][0][0] == 1:
# Order is not assured for this part of the pipeline.
sharded_eval_data.reverse()
eval_data = [np.concatenate([shard[i] for shard in sharded_eval_data])
for i in range(3)]
eval_data = {
movielens.USER_COLUMN: eval_data[0],
movielens.ITEM_COLUMN: eval_data[1],
}
self.assertGreater(
len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)
# This verifies that the samples produced are independent by counting
# duplicate entries. If the workers are not properly independent there will
# be many repeated pairs.
self.assertLess(np.mean(list(counts.values())), 1.1)
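# Rough expectation behind the 1.1 threshold (illustrative): only about
# num_positives * (1 + NUM_NEG) samples are drawn from NUM_USERS * NUM_ITEMS
# possible pairs, so nearly every observed pair should be unique and the mean
# count stays close to 1; correlated workers would push it well above 1.1.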
# ==========================================================================
# == Eval Data =============================================================
# ==========================================================================
with g.as_default():
input_fn = producer.make_input_fn(is_training=False)
dataset = input_fn(params)
eval_items_per_user = rconst.NUM_EVAL_NEGATIVES + 1
self.assertAllClose(eval_data[movielens.USER_COLUMN],
[0] * eval_items_per_user + [1] * eval_items_per_user)
eval_data = self.drain_dataset(dataset=dataset, g=g)
# Each shard process should generate different random items.
self.assertNotAllClose(
eval_data[movielens.ITEM_COLUMN][:eval_items_per_user],
eval_data[movielens.ITEM_COLUMN][eval_items_per_user:])
current_user = None
for features in eval_data:
for idx, (u, i, d) in enumerate(zip(features[movielens.USER_COLUMN],
features[movielens.ITEM_COLUMN],
features[rconst.DUPLICATE_MASK])):
u_raw = user_inv_map[u]
i_raw = item_inv_map[i]
if current_user is None:
current_user = u
# Ensure that users appear in blocks, as the evaluation logic expects
# this structure.
self.assertEqual(u, current_user)
# The structure of evaluation data is 999 negative examples followed
# by the holdout positive.
if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
# Check that the last element in each chunk is the holdout item.
self.assertEqual(i_raw, self.holdout[u_raw][1])
current_user = None
elif i_raw == self.holdout[u_raw][1]:
# Because the holdout item is not given to the negative generation
# process, it can appear as a negative. In that case, it should be
# masked out as a duplicate. (Since the true positive is placed at
# the end and would therefore lose the tie.)
assert d
else:
# Otherwise check that the other 999 points for a user are selected
# from the negatives.
assert (u_raw, i_raw) not in self.seen_pairs
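# A minimal illustrative sketch (not part of the test): given the block
# structure described above, the holdout positive for each user is simply the
# last element of every (NUM_EVAL_NEGATIVES + 1)-sized chunk. Assuming a flat
# numpy array `eval_items` of item ids in eval order:
#
#   chunks = eval_items.reshape(-1, rconst.NUM_EVAL_NEGATIVES + 1)
#   negatives, holdouts = chunks[:, :-1], chunks[:, -1]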
def test_fresh_randomness(self):
train_epochs = 5
params = self.make_params(train_epochs=train_epochs)
_, _, producer = data_preprocessing.instantiate_pipeline(
dataset=DATASET, data_dir=self.temp_data_dir, deterministic=False,
params=params)
producer.start()
results = []
g = tf.Graph()
with g.as_default():
for _ in range(train_epochs):
input_fn = producer.make_input_fn(is_training=True)
dataset = input_fn(params)
results.extend(self.drain_dataset(dataset=dataset, g=g))
producer.join()
assert producer._fatal_exception is None
positive_counts, negative_counts = defaultdict(int), defaultdict(int)
for features, labels in results:
for u, i, v, l in zip(
features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
features[rconst.VALID_POINT_MASK], labels):
if not v:
continue # ignore padding
if l:
positive_counts[(u, i)] += 1
else:
negative_counts[(u, i)] += 1
# The positive examples should appear exactly once each epoch
self.assertAllEqual(list(positive_counts.values()),
[train_epochs for _ in positive_counts])
# The threshold for the negatives is heuristic: in general repeats are
# expected, but they should not appear too frequently.
pair_cardinality = NUM_USERS * NUM_ITEMS
neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)
# Approximation of the expected number of times that a particular negative
# will appear in a given epoch. Implicit in this calculation is the treatment
# of all negative pairs as equally likely, which is not necessarily reasonable
# in general; however the generation in self.setUp() approximates this
# behavior sufficiently for heuristic testing.
e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality
# The frequency of occurrence of a given negative pair should follow an
# approximately binomial distribution in the limit that the cardinality of
# the negative pair set >> number of samples per epoch.
approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs+1),
n=train_epochs, p=e_sample)
# Tally the actual observed counts.
count_distribution = [0 for _ in range(train_epochs + 1)]
for i in negative_counts.values():
i = min([i, train_epochs]) # round down tail for simplicity.
count_distribution[i] += 1
count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])
# Check that the frequency of negative pairs is approximately binomial.
for i in range(train_epochs + 1):
if approx_pdf[i] < 0.05:
continue # Variance will be high at the tails.
observed_fraction = count_distribution[i] / neg_pair_cardinality
deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
(observed_fraction + approx_pdf[i]))
self.assertLess(deviation, 0.2)
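# Worked example of the expectation above (hypothetical numbers): with 1,000
# positive pairs, NUM_NEG = 4, and 99,000 candidate negative pairs,
# e_sample = 1000 * 4 / 99000 ~= 0.04, so over train_epochs = 5 the count for
# any particular negative pair should be roughly Binomial(n=5, p=0.04):
#
#   scipy.stats.binom.pmf(k=np.arange(6), n=5, p=0.04)
#   # ~[0.815, 0.170, 0.014, ...]: most pairs never appear, a few appear once.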
if __name__ == "__main__":
......
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains NcfModelRunner, which can train and evaluate an NCF model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import os
import time
import tensorflow as tf
from tensorflow.contrib.compiler import xla
from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing
from official.recommendation import neumf_model
class NcfModelRunner(object):
"""Creates a graph to train/evaluate an NCF model, and runs it.
This class builds both a training model and an evaluation model in the graph.
The two models share variables, so that during evaluation, the trained
variables are used.
"""
# _TrainModelProperties and _EvalModelProperties store useful properties of
# the training and evaluation models, respectively.
# _SHARED_MODEL_PROPERTY_FIELDS is their shared fields.
_SHARED_MODEL_PROPERTY_FIELDS = (
# A scalar tf.string placeholder tensor, that will be fed the path to the
# directory storing the TFRecord files for the input data.
"record_files_placeholder",
# The tf.data.Iterator to iterate over the input data.
"iterator",
# A scalar float tensor representing the model loss.
"loss",
# The batch size, as a Python int.
"batch_size",
# The op to run the model. For the training model, this trains the model
# for one step. For the evaluation model, this computes the metrics and
# updates the metric variables.
"run_model_op")
_TrainModelProperties = namedtuple("_TrainModelProperties", # pylint: disable=invalid-name
_SHARED_MODEL_PROPERTY_FIELDS)
_EvalModelProperties = namedtuple( # pylint: disable=invalid-name
"_EvalModelProperties", _SHARED_MODEL_PROPERTY_FIELDS + (
# A dict from metric name to metric tensor.
"metrics",
# Initializes the metric variables.
"metric_initializer",))
def __init__(self, ncf_dataset, params, num_train_steps, num_eval_steps,
use_while_loop):
self._num_train_steps = num_train_steps
self._num_eval_steps = num_eval_steps
self._use_while_loop = use_while_loop
with tf.Graph().as_default() as self._graph:
if params["use_xla_for_gpu"]:
# The XLA functions we use require resource variables.
tf.enable_resource_variables()
self._ncf_dataset = ncf_dataset
self._global_step = tf.train.create_global_step()
self._train_model_properties = self._build_model(params, num_train_steps,
is_training=True)
self._eval_model_properties = self._build_model(params, num_eval_steps,
is_training=False)
initializer = tf.global_variables_initializer()
self._graph.finalize()
self._session = tf.Session(graph=self._graph)
self._session.run(initializer)
def _compute_metric_mean(self, metric_name):
"""Computes the mean from a call tf tf.metrics.mean().
tf.metrics.mean() already returns the mean, so normally this call is
unnecessary. But, if tf.metrics.mean() is called inside a tf.while_loop, the
mean cannot be accessed outside the while loop. Calling this function
recomputes the mean from the variables created by tf.metrics.mean(),
allowing the mean to be accessed outside the while loop.
Args:
metric_name: The string passed to the 'name' argument of tf.metrics.mean()
Returns:
The mean of the metric.
"""
metric_vars = tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)
total_suffix = metric_name + "/total:0"
total_vars = [v for v in metric_vars if v.name.endswith(total_suffix)]
assert len(total_vars) == 1., (
"Found {} metric variables ending with '{}' but expected to find "
"exactly 1. All metric variables: {}".format(
len(total_vars), total_suffix, metric_vars))
total_var = total_vars[0]
count_suffix = metric_name + "/count:0"
count_vars = [v for v in metric_vars if v.name.endswith(count_suffix)]
assert len(count_vars) == 1., (
"Found {} metric variables ending with '{}' but expected to find "
"exactly 1. All metric variables: {}".format(
len(count_vars), count_suffix, metric_vars))
count_var = count_vars[0]
return total_var / count_var
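# Usage sketch: after the eval while loop runs, the streaming hit rate can be
# recovered as self._compute_metric_mean(rconst.HR_METRIC_NAME), which divides
# the "HR_METRIC/total" accumulator by the "HR_METRIC/count" accumulator that
# tf.metrics.mean() created (see _build_eval_specific_graph below).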
def _build_model(self, params, num_steps, is_training):
"""Builds the NCF model.
Args:
params: A dict of hyperparameters.
num_steps: The number of train or eval steps, used when building the
  while-loop variant of the graph.
is_training: If True, build the training model. If False, build the
  evaluation model.
Returns:
A _TrainModelProperties if is_training is True, or an _EvalModelProperties
otherwise.
"""
record_files_placeholder = tf.placeholder(tf.string, ())
input_fn, _, _ = \
data_preprocessing.make_input_fn(
ncf_dataset=self._ncf_dataset, is_training=is_training,
record_files=record_files_placeholder)
dataset = input_fn(params)
iterator = dataset.make_initializable_iterator()
model_fn = neumf_model.neumf_model_fn
if params["use_xla_for_gpu"]:
model_fn = xla.estimator_model_fn(model_fn)
if is_training:
return self._build_train_specific_graph(
iterator, model_fn, params, record_files_placeholder, num_steps)
else:
return self._build_eval_specific_graph(
iterator, model_fn, params, record_files_placeholder, num_steps)
def _build_train_specific_graph(self, iterator, model_fn, params,
record_files_placeholder, num_train_steps):
"""Builds the part of the model that is specific to training."""
def build():
features, labels = iterator.get_next()
estimator_spec = model_fn(
features, labels, tf.estimator.ModeKeys.TRAIN, params)
with tf.control_dependencies([estimator_spec.train_op]):
run_model_op = self._global_step.assign_add(1)
return run_model_op, estimator_spec.loss
if self._use_while_loop:
def body(i):
run_model_op_single_step, _ = build()
with tf.control_dependencies([run_model_op_single_step]):
return i + 1
run_model_op = tf.while_loop(lambda i: i < num_train_steps, body, [0],
parallel_iterations=1)
loss = None
else:
run_model_op, loss = build()
return self._TrainModelProperties(
record_files_placeholder, iterator, loss, params["batch_size"],
run_model_op)
def _build_eval_specific_graph(self, iterator, model_fn, params,
record_files_placeholder, num_eval_steps):
"""Builds the part of the model that is specific to evaluation."""
def build():
features = iterator.get_next()
estimator_spec = model_fn(
features, None, tf.estimator.ModeKeys.EVAL, params)
run_model_op = tf.group(*(update_op for _, update_op in
estimator_spec.eval_metric_ops.values()))
eval_metric_tensors = {k: tensor for (k, (tensor, _))
in estimator_spec.eval_metric_ops.items()}
return run_model_op, estimator_spec.loss, eval_metric_tensors
if self._use_while_loop:
def body(i):
run_model_op_single_step, _, _ = build()
with tf.control_dependencies([run_model_op_single_step]):
return i + 1
run_model_op = tf.while_loop(lambda i: i < num_eval_steps, body, [0],
parallel_iterations=1)
loss = None
eval_metric_tensors = {
"HR": self._compute_metric_mean(rconst.HR_METRIC_NAME),
"NDCG": self._compute_metric_mean(rconst.NDCG_METRIC_NAME),
}
else:
run_model_op, loss, eval_metric_tensors = build()
metric_initializer = tf.variables_initializer(
tf.get_collection(tf.GraphKeys.METRIC_VARIABLES))
return self._EvalModelProperties(
record_files_placeholder, iterator, loss, params["eval_batch_size"],
run_model_op, eval_metric_tensors, metric_initializer)
def _train_or_eval(self, model_properties, num_steps, is_training):
"""Either trains or evaluates, depending on whether `is_training` is True.
Args:
model_properties: _TrainModelProperties or an _EvalModelProperties
containing the properties of the training or evaluation graph.
num_steps: The number of steps to train or evaluate for.
is_training: If True, run the training model. If False, run the evaluation
model.
Returns:
record_dir: The directory of TFRecords where the training/evaluation input
data was read from.
"""
if self._ncf_dataset is not None:
epoch_metadata, record_dir, template = data_preprocessing.get_epoch_info(
is_training=is_training, ncf_dataset=self._ncf_dataset)
batch_count = epoch_metadata["batch_count"]
if batch_count != num_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(batch_count, num_steps))
record_files = os.path.join(record_dir, template.format("*"))
initializer_feed_dict = {
model_properties.record_files_placeholder: record_files}
del batch_count
else:
initializer_feed_dict = None
record_dir = None
self._session.run(model_properties.iterator.initializer,
initializer_feed_dict)
fetches = (model_properties.run_model_op,)
if model_properties.loss is not None:
fetches += (model_properties.loss,)
mode = "Train" if is_training else "Eval"
start = None
times_to_run = 1 if self._use_while_loop else num_steps
for i in range(times_to_run):
fetches_ = self._session.run(fetches)
if i % 100 == 0:
if start is None:
# Only start the timer after 100 steps so there is a warmup.
start = time.time()
start_step = i
if model_properties.loss is not None:
_, loss = fetches_
tf.logging.info("{} Loss = {}".format(mode, loss))
end = time.time()
if start is not None:
print("{} peformance: {} examples/sec".format(
mode, (i - start_step) * model_properties.batch_size / (end - start)))
return record_dir
def train(self):
"""Trains the graph for a single cycle."""
record_dir = self._train_or_eval(self._train_model_properties,
self._num_train_steps, is_training=True)
if record_dir:
# We delete the record_dir because each cycle, new TFRecords are generated
# by the async process.
tf.gfile.DeleteRecursively(record_dir)
def eval(self):
"""Evaluates the graph on the eval data.
Returns:
A dict of evaluation results.
"""
self._session.run(self._eval_model_properties.metric_initializer)
self._train_or_eval(self._eval_model_properties, self._num_eval_steps,
is_training=False)
eval_results = {
'global_step': self._session.run(self._global_step)}
for key, val in self._eval_model_properties.metrics.items():
val_ = self._session.run(val)
tf.logging.info("{} = {}".format(key, self._session.run(val)))
eval_results[key] = val_
return eval_results
......@@ -24,6 +24,8 @@ from __future__ import print_function
import contextlib
import heapq
import json
import logging
import math
import multiprocessing
import os
......@@ -40,8 +42,8 @@ import tensorflow as tf
from tensorflow.contrib.compiler import xla
from official.datasets import movielens
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import data_preprocessing
from official.recommendation import model_runner
from official.recommendation import neumf_model
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
......@@ -54,74 +56,129 @@ from official.utils.misc import model_helpers
FLAGS = flags.FLAGS
def construct_estimator(num_gpus, model_dir, iterations, params, batch_size,
eval_batch_size):
def construct_estimator(model_dir, params):
"""Construct either an Estimator or TPUEstimator for NCF.
Args:
num_gpus: The number of gpus (Used to select distribution strategy)
model_dir: The model directory for the estimator
iterations: Estimator iterations
params: The params dict for the estimator
batch_size: The mini-batch size for training.
eval_batch_size: The batch size used during evaluation.
Returns:
An Estimator or TPUEstimator.
"""
if params["use_tpu"]:
# Some of the networking libraries are quite chatty.
for name in ["googleapiclient.discovery", "googleapiclient.discovery_cache",
"oauth2client.transport"]:
logging.getLogger(name).setLevel(logging.ERROR)
tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
tpu=params["tpu"],
zone=params["tpu_zone"],
project=params["tpu_gcp_project"],
coordinator_name="coordinator"
)
tf.logging.info("Issuing reset command to TPU to ensure a clean state.")
tf.Session.reset(tpu_cluster_resolver.get_master())
tpu_config = tf.contrib.tpu.TPUConfig(
iterations_per_loop=iterations,
num_shards=8)
run_config = tf.contrib.tpu.RunConfig(
cluster=tpu_cluster_resolver,
model_dir=model_dir,
save_checkpoints_secs=600,
session_config=tf.ConfigProto(
allow_soft_placement=True, log_device_placement=False),
tpu_config=tpu_config)
tpu_params = {k: v for k, v in params.items() if k != "batch_size"}
train_estimator = tf.contrib.tpu.TPUEstimator(
model_fn=neumf_model.neumf_model_fn,
use_tpu=True,
train_batch_size=batch_size,
eval_batch_size=eval_batch_size,
params=tpu_params,
config=run_config)
eval_estimator = tf.contrib.tpu.TPUEstimator(
model_fn=neumf_model.neumf_model_fn,
use_tpu=True,
train_batch_size=1,
eval_batch_size=eval_batch_size,
params=tpu_params,
config=run_config)
return train_estimator, eval_estimator
distribution = distribution_utils.get_distribution_strategy(num_gpus=num_gpus)
# Estimator looks at the master it connects to for MonitoredTrainingSession
# by reading the `TF_CONFIG` environment variable, and the coordinator
# is used by StreamingFilesDataset.
tf_config_env = {
"session_master": tpu_cluster_resolver.get_master(),
"eval_session_master": tpu_cluster_resolver.get_master(),
"coordinator": tpu_cluster_resolver.cluster_spec()
.as_dict()["coordinator"]
}
os.environ['TF_CONFIG'] = json.dumps(tf_config_env)
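# Illustrative shape of the resulting TF_CONFIG value (hypothetical
# addresses):
#   {"session_master": "grpc://10.240.1.2:8470",
#    "eval_session_master": "grpc://10.240.1.2:8470",
#    "coordinator": ["10.240.1.10:2222"]}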
distribution = tf.contrib.distribute.TPUStrategy(
tpu_cluster_resolver, 100, params["batches_per_step"])
else:
distribution = distribution_utils.get_distribution_strategy(
num_gpus=params["num_gpus"])
run_config = tf.estimator.RunConfig(train_distribute=distribution,
eval_distribute=distribution)
params["eval_batch_size"] = eval_batch_size
model_fn = neumf_model.neumf_model_fn
if params["use_xla_for_gpu"]:
tf.logging.info("Using XLA for GPU for training and evaluation.")
model_fn = xla.estimator_model_fn(model_fn)
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir=model_dir,
config=run_config, params=params)
return estimator, estimator
return estimator
def log_and_get_hooks(eval_batch_size):
"""Convenience method for hook and logger creation."""
# Create hooks that log information about the training and metric values
train_hooks = hooks_helper.get_train_hooks(
FLAGS.hooks,
model_dir=FLAGS.model_dir,
batch_size=FLAGS.batch_size, # for ExamplesPerSecondHook
tensors_to_log={"cross_entropy": "cross_entropy"}
)
run_params = {
"batch_size": FLAGS.batch_size,
"eval_batch_size": eval_batch_size,
"number_factors": FLAGS.num_factors,
"hr_threshold": FLAGS.hr_threshold,
"train_epochs": FLAGS.train_epochs,
}
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(
model_name="recommendation",
dataset_name=FLAGS.dataset,
run_params=run_params,
test_id=FLAGS.benchmark_test_id)
return benchmark_logger, train_hooks
def parse_flags(flags_obj):
"""Convenience method to turn flags into params."""
num_gpus = flags_core.get_num_gpus(flags_obj)
num_devices = FLAGS.num_tpu_shards if FLAGS.tpu else num_gpus or 1
batch_size = distribution_utils.per_device_batch_size(
(int(flags_obj.batch_size) + num_devices - 1) //
num_devices * num_devices, num_devices)
eval_divisor = (rconst.NUM_EVAL_NEGATIVES + 1) * num_devices
eval_batch_size = int(flags_obj.eval_batch_size or flags_obj.batch_size or 1)
eval_batch_size = distribution_utils.per_device_batch_size(
(eval_batch_size + eval_divisor - 1) //
eval_divisor * eval_divisor, num_devices)
return {
"train_epochs": flags_obj.train_epochs,
"batches_per_step": num_devices,
"use_seed": flags_obj.seed is not None,
"hash_pipeline": flags_obj.hash_pipeline,
"batch_size": batch_size,
"eval_batch_size": eval_batch_size,
"learning_rate": flags_obj.learning_rate,
"mf_dim": flags_obj.num_factors,
"model_layers": [int(layer) for layer in flags_obj.layers],
"mf_regularization": flags_obj.mf_regularization,
"mlp_reg_layers": [float(reg) for reg in flags_obj.mlp_regularization],
"num_neg": flags_obj.num_neg,
"num_gpus": num_gpus,
"use_tpu": flags_obj.tpu is not None,
"tpu": flags_obj.tpu,
"tpu_zone": flags_obj.tpu_zone,
"tpu_gcp_project": flags_obj.tpu_gcp_project,
"beta1": flags_obj.beta1,
"beta2": flags_obj.beta2,
"epsilon": flags_obj.epsilon,
"match_mlperf": flags_obj.ml_perf,
"use_xla_for_gpu": flags_obj.use_xla_for_gpu,
"epochs_between_evals": FLAGS.epochs_between_evals,
}
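# Worked example of the rounding above (illustrative numbers): with an eval
# batch size of 100000, NUM_EVAL_NEGATIVES = 999, and 8 devices, the eval
# divisor is (999 + 1) * 8 = 8000, so the global eval batch size is rounded up
# to (100000 + 7999) // 8000 * 8000 = 104000. per_device_batch_size() then
# yields 13000 per device, a multiple of the 1000 eval examples per user.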
def main(_):
......@@ -129,7 +186,6 @@ def main(_):
mlperf_helper.LOGGER(FLAGS.output_ml_perf_compliance_logging):
mlperf_helper.set_ncf_root(os.path.split(os.path.abspath(__file__))[0])
run_ncf(FLAGS)
mlperf_helper.stitch_ncf()
def run_ncf(_):
......@@ -140,105 +196,35 @@ def run_ncf(_):
if FLAGS.seed is not None:
np.random.seed(FLAGS.seed)
num_gpus = flags_core.get_num_gpus(FLAGS)
batch_size = distribution_utils.per_device_batch_size(
int(FLAGS.batch_size), num_gpus)
params = parse_flags(FLAGS)
total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals
eval_per_user = rconst.NUM_EVAL_NEGATIVES + 1
eval_batch_size = int(FLAGS.eval_batch_size or
max([FLAGS.batch_size, eval_per_user]))
if eval_batch_size % eval_per_user:
eval_batch_size = eval_batch_size // eval_per_user * eval_per_user
tf.logging.warning(
"eval examples per user does not evenly divide eval_batch_size. "
"Overriding to {}".format(eval_batch_size))
if FLAGS.use_synthetic_data:
ncf_dataset = None
cleanup_fn = lambda: None
producer = data_pipeline.DummyConstructor()
num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
FLAGS.dataset]
num_train_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
num_eval_steps = data_preprocessing.SYNTHETIC_BATCHES_PER_EPOCH
num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
else:
ncf_dataset, cleanup_fn = data_preprocessing.instantiate_pipeline(
num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
dataset=FLAGS.dataset, data_dir=FLAGS.data_dir,
batch_size=batch_size,
eval_batch_size=eval_batch_size,
num_neg=FLAGS.num_neg,
epochs_per_cycle=FLAGS.epochs_between_evals,
num_cycles=total_training_cycle,
match_mlperf=FLAGS.ml_perf,
deterministic=FLAGS.seed is not None,
use_subprocess=FLAGS.use_subprocess,
cache_id=FLAGS.cache_id)
num_users = ncf_dataset.num_users
num_items = ncf_dataset.num_items
num_train_steps = int(np.ceil(
FLAGS.epochs_between_evals * ncf_dataset.num_train_positives *
(1 + FLAGS.num_neg) / FLAGS.batch_size))
num_eval_steps = int(np.ceil((1 + rconst.NUM_EVAL_NEGATIVES) *
ncf_dataset.num_users / eval_batch_size))
deterministic=FLAGS.seed is not None, params=params)
model_helpers.apply_clean(flags.FLAGS)
num_train_steps = (producer.train_batches_per_epoch //
params["batches_per_step"])
num_eval_steps = (producer.eval_batches_per_epoch //
params["batches_per_step"])
assert not producer.train_batches_per_epoch % params["batches_per_step"]
assert not producer.eval_batches_per_epoch % params["batches_per_step"]
producer.start()
params = {
"use_seed": FLAGS.seed is not None,
"hash_pipeline": FLAGS.hash_pipeline,
"batch_size": batch_size,
"eval_batch_size": eval_batch_size,
"learning_rate": FLAGS.learning_rate,
"num_users": num_users,
"num_items": num_items,
"mf_dim": FLAGS.num_factors,
"model_layers": [int(layer) for layer in FLAGS.layers],
"mf_regularization": FLAGS.mf_regularization,
"mlp_reg_layers": [float(reg) for reg in FLAGS.mlp_regularization],
"num_neg": FLAGS.num_neg,
"use_tpu": FLAGS.tpu is not None,
"tpu": FLAGS.tpu,
"tpu_zone": FLAGS.tpu_zone,
"tpu_gcp_project": FLAGS.tpu_gcp_project,
"beta1": FLAGS.beta1,
"beta2": FLAGS.beta2,
"epsilon": FLAGS.epsilon,
"match_mlperf": FLAGS.ml_perf,
"use_xla_for_gpu": FLAGS.use_xla_for_gpu,
"use_estimator": FLAGS.use_estimator,
}
if FLAGS.use_estimator:
train_estimator, eval_estimator = construct_estimator(
num_gpus=num_gpus, model_dir=FLAGS.model_dir,
iterations=num_train_steps, params=params,
batch_size=flags.FLAGS.batch_size, eval_batch_size=eval_batch_size)
else:
runner = model_runner.NcfModelRunner(ncf_dataset, params, num_train_steps,
num_eval_steps, FLAGS.use_while_loop)
params["num_users"], params["num_items"] = num_users, num_items
model_helpers.apply_clean(flags.FLAGS)
# Create hooks that log information about the training and metric values
train_hooks = hooks_helper.get_train_hooks(
FLAGS.hooks,
model_dir=FLAGS.model_dir,
batch_size=FLAGS.batch_size, # for ExamplesPerSecondHook
tensors_to_log={"cross_entropy": "cross_entropy"}
)
run_params = {
"batch_size": FLAGS.batch_size,
"eval_batch_size": eval_batch_size,
"number_factors": FLAGS.num_factors,
"hr_threshold": FLAGS.hr_threshold,
"train_epochs": FLAGS.train_epochs,
}
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(
model_name="recommendation",
dataset_name=FLAGS.dataset,
run_params=run_params,
test_id=FLAGS.benchmark_test_id)
estimator = construct_estimator(model_dir=FLAGS.model_dir, params=params)
benchmark_logger, train_hooks = log_and_get_hooks(params["eval_batch_size"])
eval_input_fn = None
target_reached = False
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_LOOP)
for cycle_index in range(total_training_cycle):
......@@ -249,47 +235,21 @@ def run_ncf(_):
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.TRAIN_EPOCH,
value=cycle_index)
# Train the model
if FLAGS.use_estimator:
train_input_fn, train_record_dir, batch_count = \
data_preprocessing.make_input_fn(
ncf_dataset=ncf_dataset, is_training=True)
if batch_count != num_train_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(batch_count, num_train_steps))
train_estimator.train(input_fn=train_input_fn, hooks=train_hooks,
steps=num_train_steps)
if train_record_dir:
tf.gfile.DeleteRecursively(train_record_dir)
tf.logging.info("Beginning evaluation.")
if eval_input_fn is None:
eval_input_fn, _, eval_batch_count = data_preprocessing.make_input_fn(
ncf_dataset=ncf_dataset, is_training=False)
if eval_batch_count != num_eval_steps:
raise ValueError(
"Step counts do not match. ({} vs. {}) The async process is "
"producing incorrect shards.".format(
eval_batch_count, num_eval_steps))
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
value=cycle_index)
eval_results = eval_estimator.evaluate(eval_input_fn,
steps=num_eval_steps)
tf.logging.info("Evaluation complete.")
else:
runner.train()
tf.logging.info("Beginning evaluation.")
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
value=cycle_index)
eval_results = runner.eval()
tf.logging.info("Evaluation complete.")
train_input_fn = producer.make_input_fn(is_training=True)
estimator.train(input_fn=train_input_fn, hooks=train_hooks,
steps=num_train_steps)
tf.logging.info("Beginning evaluation.")
eval_input_fn = producer.make_input_fn(is_training=False)
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_START,
value=cycle_index)
eval_results = estimator.evaluate(eval_input_fn, steps=num_eval_steps)
tf.logging.info("Evaluation complete.")
hr = float(eval_results[rconst.HR_KEY])
ndcg = float(eval_results[rconst.NDCG_KEY])
loss = float(eval_results["loss"])
mlperf_helper.ncf_print(
key=mlperf_helper.TAGS.EVAL_TARGET,
......@@ -300,18 +260,14 @@ def run_ncf(_):
key=mlperf_helper.TAGS.EVAL_HP_NUM_NEG,
value={"epoch": cycle_index, "value": rconst.NUM_EVAL_NEGATIVES})
# Logged by the async process during record creation.
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_HP_NUM_USERS,
deferred=True)
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.EVAL_STOP, value=cycle_index)
# Benchmark the evaluation results
benchmark_logger.log_evaluation_result(eval_results)
# Log the HR and NDCG results.
tf.logging.info(
"Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
cycle_index + 1, hr, ndcg))
"Iteration {}: HR = {:.4f}, NDCG = {:.4f}, Loss = {:.4f}".format(
cycle_index + 1, hr, ndcg, loss))
# If some evaluation threshold is met
if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
......@@ -320,7 +276,8 @@ def run_ncf(_):
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.RUN_STOP,
value={"success": target_reached})
cleanup_fn() # Cleanup data construction artifacts and subprocess.
producer.stop_loop()
producer.join()
# Clear the session explicitly to avoid session delete error
tf.keras.backend.clear_session()
......@@ -472,18 +429,6 @@ def define_ncf_flags():
return (eval_batch_size is None or
int(eval_batch_size) > rconst.NUM_EVAL_NEGATIVES)
flags.DEFINE_bool(
name="use_subprocess", default=True, help=flags_core.help_wrap(
"By default, ncf_main.py starts async data generation process as a "
"subprocess. If set to False, ncf_main.py will assume the async data "
"generation process has already been started by the user."))
flags.DEFINE_integer(name="cache_id", default=None, help=flags_core.help_wrap(
"Use a specified cache_id rather than using a timestamp. This is only "
"needed to synchronize across multiple workers. Generally this flag will "
"not need to be set."
))
flags.DEFINE_bool(
name="use_xla_for_gpu", default=False, help=flags_core.help_wrap(
"If True, use XLA for the model function. Only works when using a "
......@@ -494,30 +439,6 @@ def define_ncf_flags():
def xla_validator(flag_dict):
return not flag_dict["use_xla_for_gpu"] or not flag_dict["tpu"]
flags.DEFINE_bool(
name="use_estimator", default=True, help=flags_core.help_wrap(
"If True, use Estimator to train. Setting to False is slightly "
"faster, but when False, the following are currently unsupported:\n"
" * Using TPUs\n"
" * Using more than 1 GPU\n"
" * Reloading from checkpoints\n"
" * Any hooks specified with --hooks\n"))
flags.DEFINE_bool(
name="use_while_loop", default=None, help=flags_core.help_wrap(
"If set, run an entire epoch in a session.run() call using a "
"TensorFlow while loop. This can improve performance, but will not "
"print out losses throughout the epoch. Requires "
"--use_estimator=false"
))
xla_message = "--use_while_loop requires --use_estimator=false"
@flags.multi_flags_validator(["use_while_loop", "use_estimator"],
message=xla_message)
def while_loop_validator(flag_dict):
return (not flag_dict["use_while_loop"] or
not flag_dict["use_estimator"])
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
......
......@@ -27,6 +27,7 @@ import tensorflow as tf
from absl import flags
from absl.testing import flagsaver
from official.recommendation import constants as rconst
from official.recommendation import data_pipeline
from official.recommendation import data_preprocessing
from official.recommendation import neumf_model
from official.recommendation import ncf_main
......@@ -56,6 +57,13 @@ class NcfTest(tf.test.TestCase):
top_k=rconst.TOP_K, match_mlperf=False):
rconst.TOP_K = top_k
rconst.NUM_EVAL_NEGATIVES = predicted_scores_by_user.shape[1] - 1
batch_size = items_by_user.shape[0]
users = np.repeat(np.arange(batch_size)[:, np.newaxis],
rconst.NUM_EVAL_NEGATIVES + 1, axis=1)
users, items, duplicate_mask = \
data_pipeline.BaseDataConstructor._assemble_eval_batch(
users, items_by_user[:, -1:], items_by_user[:, :-1], batch_size)
g = tf.Graph()
with g.as_default():
......@@ -63,8 +71,7 @@ class NcfTest(tf.test.TestCase):
predicted_scores_by_user.reshape((-1, 1)), tf.float32)
softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
logits], axis=1)
duplicate_mask = tf.convert_to_tensor(
stat_utils.mask_duplicates(items_by_user, axis=1), tf.float32)
duplicate_mask = tf.convert_to_tensor(duplicate_mask, tf.float32)
metric_ops = neumf_model.compute_eval_loss_and_metrics(
logits=logits, softmax_logits=softmax_logits,
......@@ -81,21 +88,19 @@ class NcfTest(tf.test.TestCase):
sess.run(init)
return sess.run([hr[1], ndcg[1]])
def test_hit_rate_and_ndcg(self):
# Test with no duplicate items
predictions = np.array([
[1., 2., 0.], # In top 2
[2., 1., 0.], # In top 1
[0., 2., 1.], # In top 3
[2., 3., 4.] # In top 3
[2., 0., 1.], # In top 2
[1., 0., 2.], # In top 1
[2., 1., 0.], # In top 3
[3., 4., 2.] # In top 3
])
items = np.array([
[1, 2, 3],
[2, 3, 1],
[3, 2, 1],
[3, 1, 2],
[2, 1, 3],
[1, 3, 2],
])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
......@@ -130,16 +135,16 @@ class NcfTest(tf.test.TestCase):
# Test with duplicate items. In the MLPerf case, we treat the duplicates as
# a single item. Otherwise, we treat the duplicates as separate items.
predictions = np.array([
[1., 2., 2., 3.], # In top 4. MLPerf: In top 3
[3., 1., 0., 2.], # In top 1. MLPerf: In top 1
[0., 2., 3., 2.], # In top 4. MLPerf: In top 3
[3., 2., 4., 2.] # In top 2. MLPerf: In top 2
[2., 2., 3., 1.], # In top 4. MLPerf: In top 3
[1., 0., 2., 3.], # In top 1. MLPerf: In top 1
[2., 3., 2., 0.], # In top 4. MLPerf: In top 3
[2., 4., 2., 3.] # In top 2. MLPerf: In top 2
])
items = np.array([
[1, 2, 2, 3],
[1, 2, 3, 4],
[1, 2, 3, 2],
[4, 3, 2, 1],
[2, 2, 3, 1],
[2, 3, 4, 1],
[2, 3, 2, 1],
[3, 2, 1, 4],
])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
self.assertAlmostEqual(hr, 1 / 4)
......@@ -180,59 +185,6 @@ class NcfTest(tf.test.TestCase):
self.assertAlmostEqual(ndcg, (1 + math.log(2) / math.log(3) +
2 * math.log(2) / math.log(4)) / 4)
# Test with duplicate items, where the predictions for the same item can
# differ. In the MLPerf case, we should take the first prediction.
predictions = np.array([
[3., 2., 4., 4.], # In top 3. MLPerf: In top 2
[3., 4., 2., 4.], # In top 3. MLPerf: In top 3
[2., 3., 4., 1.], # In top 3. MLPerf: In top 2
[4., 3., 5., 2.] # In top 2. MLPerf: In top 1
])
items = np.array([
[1, 2, 2, 3],
[4, 3, 3, 2],
[2, 1, 1, 1],
[4, 2, 2, 1],
])
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1)
self.assertAlmostEqual(hr, 0 / 4)
self.assertAlmostEqual(ndcg, 0 / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2)
self.assertAlmostEqual(hr, 1 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3) +
3 * math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (math.log(2) / math.log(3) +
3 * math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 1,
match_mlperf=True)
self.assertAlmostEqual(hr, 1 / 4)
self.assertAlmostEqual(ndcg, 1 / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 2,
match_mlperf=True)
self.assertAlmostEqual(hr, 3 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 3,
match_mlperf=True)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3) +
math.log(2) / math.log(4)) / 4)
hr, ndcg = self.get_hit_rate_and_ndcg(predictions, items, 4,
match_mlperf=True)
self.assertAlmostEqual(hr, 4 / 4)
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3) +
math.log(2) / math.log(4)) / 4)
_BASE_END_TO_END_FLAGS = {
"batch_size": 1024,
......@@ -241,33 +193,15 @@ class NcfTest(tf.test.TestCase):
}
@flagsaver.flagsaver(**_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
@mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end(self):
ncf_main.main(None)
@flagsaver.flagsaver(ml_perf=True, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
@mock.patch.object(rconst, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_mlperf(self):
ncf_main.main(None)
@flagsaver.flagsaver(use_estimator=False, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_no_estimator(self):
ncf_main.main(None)
flags.FLAGS.ml_perf = True
ncf_main.main(None)
@flagsaver.flagsaver(use_estimator=False, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_while_loop(self):
# We cannot set use_while_loop = True in the flagsaver constructor, because
# if the flagsaver sets it to True before setting use_estimator to False,
# the flag validator will throw an error.
flags.FLAGS.use_while_loop = True
ncf_main.main(None)
flags.FLAGS.ml_perf = True
ncf_main.main(None)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
......
......@@ -76,44 +76,24 @@ def neumf_model_fn(features, labels, mode, params):
tf.set_random_seed(stat_utils.random_int32())
users = features[movielens.USER_COLUMN]
items = tf.cast(features[movielens.ITEM_COLUMN], tf.int32)
items = features[movielens.ITEM_COLUMN]
keras_model = params.get("keras_model")
if keras_model:
logits = keras_model([users, items],
training=mode == tf.estimator.ModeKeys.TRAIN)
else:
keras_model = construct_model(users=users, items=items, params=params)
logits = keras_model.output
if not params["use_estimator"] and "keras_model" not in params:
# When we are not using estimator, we need to reuse the Keras model when
# this model_fn is called again, so that the variables are shared between
# training and eval. So we mutate params to add the Keras model.
params["keras_model"] = keras_model
logits = construct_model(users, items, params).output
# Softmax with the first column of zeros is equivalent to sigmoid.
softmax_logits = tf.concat([tf.zeros(logits.shape, dtype=logits.dtype),
logits], axis=1)
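# Why this is valid: softmax([0, x]) = [1, e^x] / (1 + e^x), so the second
# component equals e^x / (1 + e^x) = sigmoid(x). Prepending the zero column
# therefore lets sparse_softmax_cross_entropy reproduce the sigmoid
# cross-entropy of a single-logit output.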
if mode == tf.estimator.ModeKeys.PREDICT:
predictions = {
movielens.ITEM_COLUMN: items,
movielens.RATING_COLUMN: logits,
}
if params["use_tpu"]:
return tf.contrib.tpu.TPUEstimatorSpec(mode=mode, predictions=predictions)
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
elif mode == tf.estimator.ModeKeys.EVAL:
if mode == tf.estimator.ModeKeys.EVAL:
duplicate_mask = tf.cast(features[rconst.DUPLICATE_MASK], tf.float32)
return compute_eval_loss_and_metrics(
logits, softmax_logits, duplicate_mask, params["num_neg"],
params["match_mlperf"],
use_tpu_spec=params["use_tpu"] or params["use_xla_for_gpu"])
use_tpu_spec=params["use_xla_for_gpu"])
elif mode == tf.estimator.ModeKeys.TRAIN:
labels = tf.cast(labels, tf.int32)
valid_pt_mask = features[rconst.VALID_POINT_MASK]
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_NAME, value="adam")
mlperf_helper.ncf_print(key=mlperf_helper.TAGS.OPT_LR,
......@@ -135,7 +115,8 @@ def neumf_model_fn(features, labels, mode, params):
value=mlperf_helper.TAGS.BCE)
loss = tf.losses.sparse_softmax_cross_entropy(
labels=labels,
logits=softmax_logits
logits=softmax_logits,
weights=tf.cast(valid_pt_mask, tf.float32)
)
# This tensor is used by logging hooks.
......@@ -151,9 +132,6 @@ def neumf_model_fn(features, labels, mode, params):
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_op = tf.group(minimize_op, update_ops)
if params["use_tpu"]:
return tf.contrib.tpu.TPUEstimatorSpec(
mode=mode, loss=loss, train_op=train_op)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
else:
......@@ -161,21 +139,18 @@ def neumf_model_fn(features, labels, mode, params):
def construct_model(users, items, params):
# type: (tf.Tensor, tf.Tensor, dict) -> tf.Tensor
# type: (tf.Tensor, tf.Tensor, dict) -> tf.keras.Model
"""Initialize NeuMF model.
Args:
users: Tensor of user ids.
items: Tensor of item ids.
params: Dict of hyperparameters.
Raises:
ValueError: if the first model layer is not even.
Returns:
logits: network logits
model: a keras Model for computing the logits
"""
num_users = params["num_users"]
num_items = params["num_items"]
......@@ -194,82 +169,39 @@ def construct_model(users, items, params):
raise ValueError("The first layer size should be multiple of 2!")
# Input variables
user_input = tf.keras.layers.Input(tensor=users)
item_input = tf.keras.layers.Input(tensor=items)
batch_size = user_input.get_shape()[0]
if params["use_tpu"]:
with tf.variable_scope("embed_weights", reuse=tf.AUTO_REUSE):
cmb_embedding_user = tf.get_variable(
name="embeddings_mf_user",
shape=[num_users, mf_dim + model_layers[0] // 2],
initializer=tf.glorot_uniform_initializer())
cmb_embedding_item = tf.get_variable(
name="embeddings_mf_item",
shape=[num_items, mf_dim + model_layers[0] // 2],
initializer=tf.glorot_uniform_initializer())
cmb_user_latent = tf.keras.layers.Lambda(lambda ids: tf.gather(
cmb_embedding_user, ids))(user_input)
cmb_item_latent = tf.keras.layers.Lambda(lambda ids: tf.gather(
cmb_embedding_item, ids))(item_input)
mlp_user_latent = tf.keras.layers.Lambda(
lambda x: tf.slice(x, [0, 0], [batch_size, model_layers[0] // 2])
)(cmb_user_latent)
mlp_item_latent = tf.keras.layers.Lambda(
lambda x: tf.slice(x, [0, 0], [batch_size, model_layers[0] // 2])
)(cmb_item_latent)
mf_user_latent = tf.keras.layers.Lambda(
lambda x: tf.slice(x, [0, model_layers[0] // 2], [batch_size, mf_dim])
)(cmb_user_latent)
mf_item_latent = tf.keras.layers.Lambda(
lambda x: tf.slice(x, [0, model_layers[0] // 2], [batch_size, mf_dim])
)(cmb_item_latent)
else:
# Initializer for embedding layers
embedding_initializer = "glorot_uniform"
# Embedding layers of GMF and MLP
mf_embedding_user = tf.keras.layers.Embedding(
num_users,
mf_dim,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1)
mf_embedding_item = tf.keras.layers.Embedding(
num_items,
mf_dim,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1)
mlp_embedding_user = tf.keras.layers.Embedding(
num_users,
model_layers[0]//2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
input_length=1)
mlp_embedding_item = tf.keras.layers.Embedding(
num_items,
model_layers[0]//2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mlp_reg_layers[0]),
input_length=1)
# GMF part
mf_user_latent = mf_embedding_user(user_input)
mf_item_latent = mf_embedding_item(item_input)
# MLP part
mlp_user_latent = mlp_embedding_user(user_input)
mlp_item_latent = mlp_embedding_item(item_input)
user_input = tf.keras.layers.Input(tensor=users, name="user_input")
item_input = tf.keras.layers.Input(tensor=items, name="item_input")
# Initializer for embedding layers
embedding_initializer = "glorot_uniform"
# It turns out to be significantly more efficient to store the MF and MLP
# embedding portions in the same table, and then slice as needed.
mf_slice_fn = lambda x: x[:, :mf_dim]
mlp_slice_fn = lambda x: x[:, mf_dim:]
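# Layout of the shared embedding row (using the run.sh hyperparameters
# mf_dim = 64 and model_layers[0] = 256 as an example): each row has
# 64 + 256 // 2 = 192 columns, of which columns [0, 64) feed the GMF branch
# and columns [64, 192) feed the MLP branch.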
embedding_user = tf.keras.layers.Embedding(
num_users, mf_dim + model_layers[0] // 2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1, name="embedding_user")(user_input)
embedding_item = tf.keras.layers.Embedding(
num_items, mf_dim + model_layers[0] // 2,
embeddings_initializer=embedding_initializer,
embeddings_regularizer=tf.keras.regularizers.l2(mf_regularization),
input_length=1, name="embedding_item")(item_input)
# GMF part
mf_user_latent = tf.keras.layers.Lambda(
mf_slice_fn, name="embedding_user_mf")(embedding_user)
mf_item_latent = tf.keras.layers.Lambda(
mf_slice_fn, name="embedding_item_mf")(embedding_item)
# MLP part
mlp_user_latent = tf.keras.layers.Lambda(
mlp_slice_fn, name="embedding_user_mlp")(embedding_user)
mlp_item_latent = tf.keras.layers.Lambda(
mlp_slice_fn, name="embedding_item_mlp")(embedding_item)
# Element-wise multiply
mf_vector = tf.keras.layers.multiply([mf_user_latent, mf_item_latent])
......@@ -352,7 +284,7 @@ def compute_eval_loss_and_metrics(logits, # type: tf.Tensor
Args:
logits: A tensor containing the predicted logits for each user. The shape
of logits is (num_users_per_batch * (1 + NUM_EVAL_NEGATIVES),) Logits
for a user are grouped, and the first element of the group is the true
for a user are grouped, and the last element of the group is the true
element.
softmax_logits: The same tensor, but with zeros left-appended.
......@@ -377,9 +309,9 @@ def compute_eval_loss_and_metrics(logits, # type: tf.Tensor
# Examples are provided by the eval Dataset in a structured format, so eval
# labels can be reconstructed on the fly.
eval_labels = tf.reshape(tf.one_hot(
tf.zeros(shape=(logits_by_user.shape[0],), dtype=tf.int32),
logits_by_user.shape[1], dtype=tf.int32), (-1,))
eval_labels = tf.reshape(shape=(-1,), tensor=tf.one_hot(
tf.zeros(shape=(logits_by_user.shape[0],), dtype=tf.int32) +
rconst.NUM_EVAL_NEGATIVES, logits_by_user.shape[1], dtype=tf.int32))
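# For each user this yields NUM_EVAL_NEGATIVES zeros followed by a single one
# marking the holdout item; e.g. with three negatives per user the flattened
# labels read [0, 0, 0, 1, 0, 0, 0, 1, ...].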
eval_labels_float = tf.cast(eval_labels, tf.float32)
......@@ -463,7 +395,8 @@ def compute_top_k_and_ndcg(logits, # type: tf.Tensor
# perform matrix multiplications very quickly. This is similar to np.argwhere.
# However this is a special case because the target will only appear in
# sort_indices once.
one_hot_position = tf.cast(tf.equal(sort_indices, 0), tf.int32)
one_hot_position = tf.cast(tf.equal(sort_indices, rconst.NUM_EVAL_NEGATIVES),
tf.int32)
sparse_positions = tf.multiply(
one_hot_position, tf.range(logits_by_user.shape[1])[tf.newaxis, :])
position_vector = tf.reduce_sum(sparse_positions, axis=1)
......
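The rank recovery in the hunk above can be reproduced outside TensorFlow. A hedged NumPy sketch (logit values and the NUM_EVAL_NEGATIVES value are invented for the example) of how the position of the true item falls out of the one-hot and range trick:

import numpy as np

NUM_EVAL_NEGATIVES = 4                               # assumed value for the sketch
logits_by_user = np.array([[0.1, 0.9, 0.3, 0.2, 0.8]])  # true item in the last column
sort_indices = np.argsort(-logits_by_user, axis=1)       # column indices, best first

# The true item lives in column NUM_EVAL_NEGATIVES, so its rank is the slot in
# which that column index appears in the sorted order.
one_hot_position = (sort_indices == NUM_EVAL_NEGATIVES).astype(np.int32)
position_vector = (one_hot_position * np.arange(logits_by_user.shape[1])).sum(axis=1)
print(position_vector)                               # [1]: the true item ranked second
# HR@k is then (position_vector < k); NDCG uses log(2) / log(position_vector + 2).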
......@@ -16,21 +16,15 @@
import contextlib
import multiprocessing
import os
import sys
import multiprocessing.pool
_PYTHON = sys.executable
if not _PYTHON:
raise RuntimeError("Could not find path to Python interpreter in order to "
"spawn subprocesses.")
def get_forkpool(num_workers, init_worker=None, closing=True):
pool = multiprocessing.Pool(processes=num_workers, initializer=init_worker)
return contextlib.closing(pool) if closing else pool
_ASYNC_GEN_PATH = os.path.join(os.path.dirname(__file__),
"data_async_generation.py")
INVOCATION = [_PYTHON, _ASYNC_GEN_PATH]
def get_pool(num_workers, init_worker=None):
return contextlib.closing(multiprocessing.Pool(
processes=num_workers, initializer=init_worker))
def get_threadpool(num_workers, init_worker=None, closing=True):
pool = multiprocessing.pool.ThreadPool(processes=num_workers,
initializer=init_worker)
return contextlib.closing(pool) if closing else pool
......@@ -27,7 +27,7 @@ mkdir -p ${LOCAL_TEST_DIR}
TPU=${TPU:-""}
if [[ -z ${TPU} ]]; then
DEVICE_FLAG="--num_gpus -1 --use_xla_for_gpu"
DEVICE_FLAG="--num_gpus -1" # --use_xla_for_gpu"
else
DEVICE_FLAG="--tpu ${TPU} --num_gpus 0"
fi
......@@ -64,15 +64,18 @@ do
--dataset ${DATASET} --hooks "" \
${DEVICE_FLAG} \
--clean \
--train_epochs 20 \
--batch_size 2048 \
--eval_batch_size 100000 \
--learning_rate 0.0005 \
--train_epochs 14 \
--batch_size 98304 \
--eval_batch_size 160000 \
--learning_rate 0.00382059 \
--beta1 0.783529 \
--beta2 0.909003 \
--epsilon 1.45439e-07 \
--layers 256,256,128,64 --num_factors 64 \
--hr_threshold 0.635 \
--ml_perf \
|& tee ${RUN_LOG} \
| grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"
| grep --line-buffered -E --regexp="(Iteration [0-9]+: HR = [0-9\.]+, NDCG = [0-9\.]+, Loss = [0-9\.]+)|(pipeline_hash)|(MLPerf time:)"
END_TIME=$(date +%s)
echo "Run ${i} complete: $(( $END_TIME - $START_TIME )) seconds."
......
......@@ -18,71 +18,40 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
from collections import deque
import multiprocessing
import os
import struct
import sys
import threading
import time
import numpy as np
def random_int32():
return np.random.randint(low=0, high=np.iinfo(np.int32).max, dtype=np.int32)
from official.recommendation import popen_helper
def sample_with_exclusion(num_items, positive_set, n, replacement=True):
# type: (int, typing.Iterable, int, bool) -> list
"""Vectorized negative sampling.
This function samples from the positive set's conjugate, both with and
without replacement.
def random_int32():
return np.random.randint(low=0, high=np.iinfo(np.int32).max, dtype=np.int32)
Performance:
This algorithm generates a vector of candidate values based on the expected
number needed such that at least k are not in the positive set, where k
is the number of negatives still needed. An additional safety factor of 1.2
is used during generation to minimize the chance of having to perform
another generation cycle.
While this approach generates more values than needed and then discards some
of them, vectorized generation is inexpensive and turns out to be much
faster than generating points one at a time. (And it defers quite a bit
of work to NumPy which has much better multi-core utilization than native
Python.)
def permutation(args):
x, seed = args
seed = seed or struct.unpack("<L", os.urandom(4))[0]
state = np.random.RandomState(seed=seed) # pylint: disable=no-member
output = np.arange(x, dtype=np.int32)
state.shuffle(output)
return output
Args:
num_items: The cardinality of the entire set of items.
positive_set: The set of positive items which should not be included as
negatives.
n: The number of negatives to generate.
replacement: Whether to sample with (True) or without (False) replacement.
Returns:
A list of generated negatives.
"""
def very_slightly_biased_randint(max_val_vector):
sample_dtype = np.uint64
out_dtype = max_val_vector.dtype
samples = np.random.randint(low=0, high=np.iinfo(sample_dtype).max,
size=max_val_vector.shape, dtype=sample_dtype)
return np.mod(samples, max_val_vector.astype(sample_dtype)).astype(out_dtype)
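# Note: reducing a uniform uint64 sample with np.mod slightly over-weights the
# smaller remainders (relative skew of roughly max_val / 2**64, about 5e-14 for
# a vocabulary of a million items), hence "very slightly biased".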
if not isinstance(positive_set, set):
positive_set = set(positive_set)
p = 1 - len(positive_set) / num_items
n_attempt = int(n * (1 / p) * 1.2) # factor of 1.2 for safety
# If sampling is performed with replacement, candidates are appended.
# Otherwise, they should be added with a set union to remove duplicates.
if replacement:
negatives = []
else:
negatives = set()
while len(negatives) < n:
negative_candidates = np.random.randint(
low=0, high=num_items, size=(n_attempt,))
if replacement:
negatives.extend(
[i for i in negative_candidates if i not in positive_set]
)
else:
negatives |= (set(negative_candidates) - positive_set)
if not replacement:
negatives = list(negatives)
np.random.shuffle(negatives)  # list(set(...)) gives no order guarantee, but in
# practice tends to come out nearly sorted, so shuffle explicitly.
return negatives[:n]
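A quick usage sketch for sample_with_exclusion (toy item counts, not values used by the pipeline):

num_items = 10
positives = {1, 3, 5}
negatives = sample_with_exclusion(num_items, positives, n=4, replacement=False)
assert len(negatives) == 4
assert not set(negatives) & positives        # no sampled item is a known positive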
def mask_duplicates(x, axis=1): # type: (np.ndarray, int) -> np.ndarray
"""Identify duplicates from sampling with replacement.
......
......@@ -7,4 +7,5 @@ oauth2client>=4.1.2
pandas
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy
typing
......@@ -34,7 +34,7 @@ import typing
import tensorflow as tf
_MIN_VERSION = (0, 0, 6)
_MIN_VERSION = (0, 0, 10)
_STACK_OFFSET = 2
SUDO = "sudo" if os.geteuid() else ""
......@@ -186,60 +186,6 @@ def clear_system_caches():
raise ValueError("Failed to clear caches")
def stitch_ncf():
"""Format NCF logs for MLPerf compliance."""
if not LOGGER.enabled:
return
if LOGGER.log_file is None or not tf.gfile.Exists(LOGGER.log_file):
tf.logging.warning("Could not find log file to stitch.")
return
log_lines = []
num_eval_users = None
start_time = None
stop_time = None
with tf.gfile.Open(LOGGER.log_file, "r") as f:
for line in f:
parsed_line = parse_line(line)
if not parsed_line:
tf.logging.warning("Failed to parse line: {}".format(line))
continue
log_lines.append(parsed_line)
if parsed_line.tag == TAGS.RUN_START:
assert start_time is None
start_time = float(parsed_line.timestamp)
if parsed_line.tag == TAGS.RUN_STOP:
assert stop_time is None
stop_time = float(parsed_line.timestamp)
if (parsed_line.tag == TAGS.EVAL_HP_NUM_USERS and parsed_line.value
is not None and "DEFERRED" not in parsed_line.value):
assert num_eval_users is None or num_eval_users == parsed_line.value
num_eval_users = parsed_line.value
log_lines.pop()
for i, parsed_line in enumerate(log_lines):
if parsed_line.tag == TAGS.EVAL_HP_NUM_USERS:
log_lines[i] = ParsedLine(*parsed_line[:-1], value=num_eval_users)
log_lines = sorted([unparse_line(i) for i in log_lines])
output_path = os.getenv("STITCHED_COMPLIANCE_FILE", None)
if output_path:
with tf.gfile.Open(output_path, "w") as f:
for line in log_lines:
f.write(line + "\n")
else:
for line in log_lines:
print(line)
sys.stdout.flush()
if start_time is not None and stop_time is not None:
tf.logging.info("MLPerf time: {:.1f} sec.".format(stop_time - start_time))
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
with LOGGER(True):
......