Unverified commit fec0338f authored by pyoung2778, committed by GitHub

Checkin seq_flow_lite (#10219)

Parent c6d7d57d
......@@ -16,10 +16,10 @@ http_archive(
http_archive(
name = "org_tensorflow",
sha256 = "fc6d7c57cd9427e695a38ad00fb6ecc3f623bac792dd44ad73a3f85b338b68be",
strip_prefix = "tensorflow-8a4ffe2e1ae722cff5306778df0cfca8b7f503fe",
sha256 = "40d3203ab5f246d83bae328288a24209a2b85794f1b3e2cd0329458d8e7c1985",
strip_prefix = "tensorflow-2.6.0",
urls = [
"https://github.com/tensorflow/tensorflow/archive/8a4ffe2e1ae722cff5306778df0cfca8b7f503fe.tar.gz",
"https://github.com/tensorflow/tensorflow/archive/v2.6.0.zip",
],
)
......@@ -49,41 +49,6 @@ PROTOC_VERSION = "3.9.0"
PROTOC_SHA256 = "15e395b648a1a6dda8fd66868824a396e9d3e89bc2c8648e3b9ab9801bea5d55"
reverb_protoc_deps(version = PROTOC_VERSION, sha256 = PROTOC_SHA256)
# ABSL cpp library.
http_archive(
name = "com_google_absl",
sha256 = "f368a8476f4e2e0eccf8a7318b98dafbe30b2600f4e3cf52636e5eb145aba06a", # SHARED_ABSL_SHA
strip_prefix = "abseil-cpp-df3ea785d8c30a9503321a3d35ee7d35808f190d",
urls = [
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz",
"https://github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz",
],
)
http_archive(
name = "rules_cc",
strip_prefix = "rules_cc-master",
urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"],
)
# GoogleTest/GoogleMock framework. Used by most unit-tests.
http_archive(
name = "com_google_googletest",
urls = ["https://github.com/google/googletest/archive/master.zip"],
strip_prefix = "googletest-master",
)
# gflags needed by glog
http_archive(
name = "com_github_gflags_gflags",
sha256 = "6e16c8bc91b1310a44f3965e616383dbda48f83e8c1eaa2370a215057b00cabe",
strip_prefix = "gflags-77592648e3f3be87d6c7123eb81cbad75f9aef5a",
urls = [
"https://mirror.bazel.build/github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
"https://github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
],
)
# glog
http_archive(
name = "com_google_glog",
......@@ -92,16 +57,6 @@ http_archive(
urls = ["https://github.com/google/glog/archive/v0.4.0.tar.gz"],
)
http_archive(
name = "absl_py",
sha256 = "603febc9b95a8f2979a7bdb77d2f5e4d9b30d4e0d59579f88eba67d4e4cc5462",
strip_prefix = "abseil-py-pypi-v0.9.0",
urls = [
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
"https://github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
],
)
http_archive(
name = "utf_archive",
build_file = "@//third_party:utf.BUILD",
......@@ -113,25 +68,17 @@ http_archive(
)
#-----------------------------------------------------------------------------
# proto
#-----------------------------------------------------------------------------
# proto_library, cc_proto_library and java_proto_library rules implicitly depend
# on @com_google_protobuf//:proto, @com_google_protobuf//:cc_toolchain and
# @com_google_protobuf//:java_toolchain, respectively.
# This statement defines the @com_google_protobuf repo.
http_archive(
name = "com_google_protobuf",
strip_prefix = "protobuf-3.8.0",
urls = ["https://github.com/google/protobuf/archive/v3.8.0.zip"],
sha256 = "1e622ce4b84b88b6d2cdf1db38d1a634fe2392d74f0b7b74ff98f3a51838ee53",
)
load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")
tf_workspace3()
load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2")
tf_workspace2()
load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo")
flatbuffers()
load("@org_tensorflow//tensorflow:workspace1.bzl", "tf_workspace1")
tf_workspace1()
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace")
tf_workspace(tf_repo_name = "org_tensorflow")
load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0")
tf_workspace0()
# TF submodule compilation doesn't take care of grpc deps. Do it manually here.
......@@ -168,7 +115,7 @@ new_git_repository(
remote = "https://github.com/unicode-org/icu",
build_file = "@//third_party:icu.BUILD",
patch_cmds = [
"find . -type f -exec sed -i 's/#\s*include \"unicode/#include \"icu4c\/source\/common\/unicode/g' {} \;",
"find . -type f -exec sed -i 's/#\\s*include \"unicode/#include \"icu4c\\/source\\/common\\/unicode/g' {} \\;",
],
)
......
......@@ -5,5 +5,6 @@ sh_binary(
"//tf_ops:sequence_string_projection_op_py",
"//tf_ops:sequence_string_projection_op_v2_py",
"//tf_ops:tf_custom_ops_py",
"//tflite_ops:registerer",
],
)
......@@ -30,3 +30,5 @@ cp -f "${RUNFILES_DIR}/tf_ops/libtf_custom_ops_py_gen_op.so" \
cp -f "${RUNFILES_DIR}/tf_ops/tf_custom_ops_py.py" \
"${BUILD_WORKSPACE_DIRECTORY}/tf_ops"
cp -f "${RUNFILES_DIR}/tflite_ops/registerer.so" \
"${BUILD_WORKSPACE_DIRECTORY}/tflite_ops"
......@@ -44,7 +44,7 @@ class _BazelBuildCommand(setuptools.Command):
setuptools.setup(
name='seq_flow_lite',
version='0.1',
packages=['tf_ops'],
packages=['tf_ops', 'tflite_ops'],
package_data={'': ['*.so']},
cmdclass={
'build': _BuildCommand,
......
......@@ -48,9 +48,9 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
tflite::ops::builtin::BuiltinOpResolver resolver;
resolver.AddCustom(
"SEQUENCE_STRING_PROJECTION",
tflite::ops::custom::Register_SEQUENCE_STRING_PROJECTION());
::seq_flow_lite::ops::custom::Register_SEQUENCE_STRING_PROJECTION());
resolver.AddCustom("ExpectedValueOp",
tflite::ops::custom::Register_EXPECTED_VALUE());
::seq_flow_lite::ops::custom::Register_EXPECTED_VALUE());
tflite::InterpreterBuilder(model, resolver,
/*error_reporter=*/nullptr)(&interpreter);
if (!interpreter) {
......@@ -105,7 +105,7 @@ std::vector<float> InvokeModel(
const size_t num_classes = output_dims[kClassOutputClassIndex];
for (int i = 0; i < num_classes; ++i) {
// Find class probability or log probability for the class index
classes.push_back(tflite::PodDequantize(*class_output, i));
classes.push_back(::seq_flow_lite::PodDequantize(*class_output, i));
}
return classes;
}
......
......@@ -30,6 +30,8 @@ from utils import tflite_utils # import seq_flow_lite module
FLAGS = flags.FLAGS
flags.DEFINE_string("output_dir", None, "The output or model directory.")
flags.DEFINE_enum("output", "sigmoid", ["logits", "sigmoid", "softmax"],
"Specification of the output tensor.")
def load_runner_config():
......@@ -51,12 +53,20 @@ def main(_):
encoder = model.Encoder(model_config, base_layers.TFLITE)
projection, seq_length = prxlayer(text)
logits = encoder(projection, seq_length)
if FLAGS.output == "logits":
outputs = logits
elif FLAGS.output == "sigmoid":
outputs = tf.math.sigmoid(logits)
else:
assert FLAGS.output == "softmax", "Unexpected output"
outputs = tf.nn.softmax(logits)
session.run(tf.global_variables_initializer())
session.run(tf.local_variables_initializer())
saver = tf.train.Saver()
saver.restore(session, tf.train.latest_checkpoint(FLAGS.output_dir))
tflite_fb = tflite_utils.generate_tflite(session, graph, [text], [logits])
tflite_fb = tflite_utils.generate_tflite(session, graph, [text],
[outputs])
output_file_name = os.path.join(FLAGS.output_dir, "tflite.fb")
with tf.gfile.Open(output_file_name, "wb") as f:
f.write(tflite_fb)
......
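Editor's note: the new --output flag above selects which head gets serialized into the TFLite flatbuffer. As a reminder of the semantics (sigmoid gives independent per-class probabilities for multi-label heads; softmax gives a distribution over classes), here is a plain numpy sketch, separate from the script itself:

import numpy as np

logits = np.array([1.5, -0.5, 0.2])

# Sigmoid: each class scored independently, entries need not sum to 1.
sigmoid = 1.0 / (1.0 + np.exp(-logits))

# Softmax: probabilities over classes that sum to 1.
softmax = np.exp(logits - logits.max())
softmax /= softmax.sum()

print(sigmoid)        # independent per-class probabilities
print(softmax.sum())  # 1.0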
......@@ -54,19 +54,23 @@ class BaseLayer(tf.keras.layers.Layer):
assert len(tensor.get_shape().as_list()) == rank
assert tensor.dtype == dtype
def add_qweight(self, shape, num_bits=8):
"""Return a quantized weight variable for the given shape."""
def add_weight_wrapper(self, shape):
"""Return a weight variable for the given shape."""
if self.parameters.initializer is not None:
initializer = self.parameters.initializer
else:
initializer = tf.keras.initializers.GlorotUniform()
weight = self.add_weight(
"weight", shape, initializer=initializer, trainable=True)
"weight",
shape,
initializer=initializer,
trainable=True,
dtype=tf.float32)
self.add_reg_loss(weight)
return self._weight_quantization(weight, num_bits=num_bits)
return weight
def _weight_quantization(self, tensor, num_bits=8):
"""Quantize weights when enabled."""
def quantize_parameter(self, tensor, num_bits=8):
"""Quantize parameters when enabled."""
# For infer mode, toco computes the min/max from the weights offline to
# quantize them. During train/eval this is computed from the current value
# in the session by the graph itself.
......@@ -98,21 +102,37 @@ class BaseLayer(tf.keras.layers.Layer):
def assign_moving_average(self, var, update, ema_decay):
return var.assign(var.read_value() * (1 - ema_decay) + (ema_decay) * update)
def qrange_sigmoid(self, tensor):
if self.parameters.quantize:
def quantize_tensor(self, tf_only):
if tf_only and self.parameters.mode == TFLITE:
return False
return self.parameters.quantize
def qrange_sigmoid(self, tensor, tf_only=False):
if self.quantize_tensor(tf_only):
return tf.quantization.fake_quant_with_min_max_args(tensor, 0.0, 1.0)
return tensor
def qrange_tanh(self, tensor):
if self.parameters.quantize:
def qrange_tanh(self, tensor, tf_only=False):
if self.quantize_tensor(tf_only):
return tf.quantization.fake_quant_with_min_max_args(tensor, -1.0, 1.0)
return tensor
def quantized_tanh(self, tensor):
return self.qrange_tanh(tf.tanh(tensor))
def quantized_tanh(self, tensor, tf_only=False):
return self.qrange_tanh(tf.tanh(tensor), tf_only)
def quantized_sigmoid(self, tensor):
return self.qrange_sigmoid(tf.sigmoid(tensor))
def quantized_sigmoid(self, tensor, tf_only=False):
return self.qrange_sigmoid(tf.sigmoid(tensor), tf_only)
def get_batch_dimension(self, tensor):
return tensor.get_shape().as_list()[0] or tf.shape(tensor)[0]
def inverse_normalizer(self, mask):
return tf.math.reciprocal(tf.reduce_sum(mask))
def random_drop_to_zero(self, tensor, zero_probability):
rnd = tf.random.uniform(
shape=tf.shape(tensor),
minval=-zero_probability,
maxval=(1.0 - zero_probability),
dtype=tensor.dtype)
return tf.math.ceil(rnd)
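Editor's note: the ceiling trick in random_drop_to_zero works because a uniform sample from [-p, 1 - p) is negative with probability p, and ceil maps negative samples to 0 and positive ones to 1, producing a Bernoulli mask without any comparison op. A minimal numpy sketch of the same idea (illustrative only):

import numpy as np

p = 0.3  # probability of dropping an element to zero
rnd = np.random.uniform(low=-p, high=1.0 - p, size=1_000_000)
mask = np.ceil(rnd)  # 0.0 with probability p, 1.0 otherwise
print(mask.mean())   # close to 1 - p = 0.7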
......@@ -60,7 +60,7 @@ class EncoderQConvolution(base_layers.BaseLayer):
assert len(input_shapes) == self.rank
self.in_filters = input_shapes[-1]
shape = self.ksize + [self.in_filters, self.out_filters]
self.filters = self.add_qweight(shape=shape)
self.filters = self.add_weight_wrapper(shape=shape)
if self.bias:
self.b = self.add_bias(shape=[self.out_filters])
......@@ -70,7 +70,7 @@ class EncoderQConvolution(base_layers.BaseLayer):
def _conv_r4(self, inputs, normalize_method):
outputs = tf.nn.conv2d(
inputs,
self.filters,
self.quantize_parameter(self.filters),
strides=self.strides,
padding=self.padding,
dilations=self.dilations)
......
......@@ -47,7 +47,7 @@ class BaseQDense(base_layers.BaseLayer):
assert input_shapes[1] == 1 or input_shapes[2] == 1
self.in_units = input_shapes[-1]
shape = [self.in_units, self.units]
self.w = self.add_qweight(shape=shape)
self.w = self.add_weight_wrapper(shape=shape)
if self.bias:
self.b = self.add_bias(shape=[self.units])
......@@ -55,7 +55,7 @@ class BaseQDense(base_layers.BaseLayer):
self.normalization = normalization_layers.BatchNormalization(**kwargs)
def _dense_r2(self, inputs, normalize_method):
outputs = tf.matmul(inputs, self.w)
outputs = tf.matmul(inputs, self.quantize_parameter(self.w))
if self.bias:
outputs = tf.nn.bias_add(outputs, self.b)
if self.normalize:
......@@ -98,7 +98,9 @@ class BaseQDenseVarLen(BaseQDense):
self.normalization = normalization_layers.VarLenBatchNormalization(
rank=2, **kwargs)
def call(self, inputs, mask, inverse_normalizer):
def call(self, inputs, mask, inverse_normalizer=None):
if inverse_normalizer is None:
inverse_normalizer = self.inverse_normalizer(mask)
def normalize_method(tensor):
maskr2 = tf.reshape(mask, [-1, 1])
......
......@@ -25,7 +25,7 @@ from tf_ops import sequence_string_projection_op_v2 as sspv2 # import seq_flow_l
class ProjectionLayer(base_layers.BaseLayer):
"""Base class for encoders."""
def __init__(self, model_config, mode):
def __init__(self, model_config, mode, **kwargs):
"""Create projection."""
def _get_params(varname, default_value=None):
......@@ -50,7 +50,7 @@ class ProjectionLayer(base_layers.BaseLayer):
if mode == base_layers.TRAIN:
_get_params("distortion_probability", 0.0)
parameters = base_layers.Parameters(mode, self.quantize)
super(ProjectionLayer, self).__init__(parameters=parameters)
super(ProjectionLayer, self).__init__(parameters=parameters, **kwargs)
def call(self, inputs):
projection, _, seq_length = ssp.sequence_string_projection(
......@@ -74,15 +74,14 @@ class ProjectionLayer(base_layers.BaseLayer):
batch_size = self.get_batch_dimension(inputs)
projection = tf.reshape(projection,
[batch_size, self.max_seq_len, self.feature_size])
if self.mode in modes:
projection = self.qrange_tanh(projection)
projection = self.qrange_tanh(projection)
return projection, seq_length
class ProjectionLayerPreSegmented(base_layers.BaseLayer):
"""Base class for encoders."""
def __init__(self, model_config, mode):
def __init__(self, model_config, mode, **kwargs):
"""Create projection."""
def _get_params(varname, default_value=None):
......@@ -101,11 +100,13 @@ class ProjectionLayerPreSegmented(base_layers.BaseLayer):
if mode == base_layers.TRAIN:
_get_params("distortion_probability", 0.0)
parameters = base_layers.Parameters(mode, self.quantize)
super(ProjectionLayerPreSegmented, self).__init__(parameters=parameters)
super(ProjectionLayerPreSegmented, self).__init__(
parameters=parameters, **kwargs)
def call(self, inputs, sequence_length):
def call(self, inputs):
tokens, sequence_length = inputs
projection = sspv2.sequence_string_projection_v2(
input=inputs,
input=tokens,
sequence_length=sequence_length,
feature_size=self.feature_size,
distortion_probability=self.distortion_probability,
......
......@@ -27,6 +27,8 @@ class ActivationQuantization(base_layers.BaseLayer):
self.ema_decay = ema_decay
self.num_bits = num_bits
super(ActivationQuantization, self).__init__(**kwargs)
def build(self, input_shapes):
if self.parameters.quantize:
self.min_var = self.add_weight(
"min", initializer=tf.keras.initializers.Zeros(), trainable=False)
......@@ -53,6 +55,7 @@ class ActivationQuantization(base_layers.BaseLayer):
return inputs
def quantize_using_range(self, inputs):
# This method can only be called after a call to the "call" method of this class.
if self.parameters.quantize:
return tf.quantization.fake_quant_with_min_max_vars(
inputs, self.min_var, self.max_var, num_bits=self.num_bits)
......@@ -66,21 +69,24 @@ class ConcatQuantization(ActivationQuantization):
self.axis = axis
super(ConcatQuantization, self).__init__(**kwargs)
def reduce_list(self, tensor_list, functor):
def _reduce_list(self, tensor_list, functor):
reduce_result = [functor(tensor) for tensor in tensor_list]
# Toco expects 0.0 to be part of the quantization range.
reduce_result.append(tf.constant(0.0))
return functor(tf.stack(reduce_result))
def call(self, tensors):
# Ignore empty invocations done to build the keras layer.
if tensors is None:
return
if self.parameters.quantize:
if self.parameters.mode == base_layers.TRAIN:
# Toco expects 0.0 to be part of the quantization range.
batch_min = self.reduce_list(tensors, tf.reduce_min)
batch_min = self._reduce_list(tensors, tf.reduce_min)
min_var = self.assign_moving_average(self.min_var, batch_min,
self.ema_decay)
batch_max = self.reduce_list(tensors, tf.reduce_max)
batch_max = self._reduce_list(tensors, tf.reduce_max)
max_var = self.assign_moving_average(self.max_var, batch_max,
self.ema_decay)
else:
......
......@@ -27,21 +27,17 @@ def classification_metric(per_example_loss, label_ids, logits):
}
THRESHOLDS = [0.5]
def labeling_metric(per_example_loss, label_ids, logits):
"""Compute eval metrics."""
scores = tf.math.sigmoid(logits)
binary_prediction = tf.math.greater_equal(scores, 0.5)
num_classes = label_ids.get_shape().as_list()[-1]
return_dict = {"eval_loss": tf.metrics.mean(per_example_loss)}
for idx in range(num_classes):
return_dict["auc/" + str(idx)] = tf.metrics.auc(label_ids[:, idx],
scores[:, idx])
return_dict["precision@" + str(THRESHOLDS) + "/" +
str(idx)] = tf.metrics.precision_at_thresholds(
label_ids[:, idx], scores[:, idx], thresholds=THRESHOLDS)
return_dict["recall@" + str(THRESHOLDS) + "/" +
str(idx)] = tf.metrics.recall_at_thresholds(
label_ids[:, idx], scores[:, idx], thresholds=THRESHOLDS)
return_dict["precision/" + str(idx)] = tf.metrics.precision(
label_ids[:, idx], binary_prediction[:, idx])
return_dict["recall/" + str(idx)] = tf.metrics.recall(
label_ids[:, idx], binary_prediction[:, idx])
return return_dict
......@@ -38,6 +38,7 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen):
assert bool(ngram is None) != bool(skip_bigram is None)
self.kwidth = ngram if ngram is not None else (skip_bigram + 2)
mask = [1] * self.kwidth
self.skipgram = skip_bigram is not None
if skip_bigram is not None:
mask[1], mask[skip_bigram] = 0, 0
self.mask = np.array(mask, dtype="float32").reshape((1, self.kwidth, 1, 1))
......@@ -56,10 +57,10 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen):
return result * mask + (1 - mask) * self.invalid_value
return result
def add_qweight(self, shape, num_bits=8):
weight = super(PaddedMaskedVarLenConv, self).add_qweight(
shape=shape, num_bits=num_bits)
return weight * tf.convert_to_tensor(self.mask)
def quantize_parameter(self, weight, num_bits=8):
weight = super(PaddedMaskedVarLenConv, self).quantize_parameter(
weight, num_bits=num_bits)
return weight * tf.convert_to_tensor(self.mask) if self.skipgram else weight
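Editor's note: to make the masking above concrete, a skip-bigram of gap n is realized as a convolution of width n + 2 whose interior taps are zeroed, so only the two end tokens contribute. A small numpy sketch mirroring the mask construction in __init__ (illustrative only):

import numpy as np

def skip_bigram_mask(skip_bigram):
    # Mirrors PaddedMaskedVarLenConv: width is skip_bigram + 2, with the
    # interior taps zeroed so only the two end tokens contribute.
    kwidth = skip_bigram + 2
    mask = [1.0] * kwidth
    mask[1], mask[skip_bigram] = 0.0, 0.0
    return np.array(mask)

print(skip_bigram_mask(1))  # [1. 0. 1.]
print(skip_bigram_mask(2))  # [1. 0. 0. 1.]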
class AttentionPoolReduce(base_layers.BaseLayer):
......@@ -97,8 +98,8 @@ class AttentionPoolReduce(base_layers.BaseLayer):
class Encoder(tf.keras.layers.Layer):
"""A PRADO keras model."""
def __init__(self, config, mode):
super(Encoder, self).__init__()
def __init__(self, config, mode, **kwargs):
super(Encoder, self).__init__(**kwargs)
def _get_params(varname, default_value=None):
value = config[varname] if varname in config else default_value
......@@ -118,7 +119,7 @@ class Encoder(tf.keras.layers.Layer):
_get_params("skip1bigram_channels", 0)
_get_params("skip2bigram_channels", 0)
_get_params("network_regularizer_scale", 1e-4)
_get_params("keep_prob", 0.5)
_get_params("keep_prob", 1.0)
self.num_classes = len(self.labels)
self.parameters = base_layers.Parameters(
......@@ -129,7 +130,6 @@ class Encoder(tf.keras.layers.Layer):
units=self.embedding_size, rank=3, parameters=self.parameters)
self.attention_fc = dense_layers.BaseQDenseVarLen(
units=self.embedding_size, rank=3, parameters=self.parameters)
self.dropout = tf.keras.layers.Dropout(rate=(1 - self.keep_prob))
self.parameters = copy.copy(self.parameters)
self.parameters.regularizer_scale = self.network_regularizer_scale
......@@ -161,8 +161,8 @@ class Encoder(tf.keras.layers.Layer):
def _apply_fc_dropout(self, layer, inputs, mask, inverse_normalizer):
outputs = layer(inputs, mask, inverse_normalizer)
if self.parameters.mode == base_layers.TRAIN:
return self.dropout(outputs)
if self.parameters.mode == base_layers.TRAIN and self.keep_prob < 1.0:
return tf.nn.dropout(outputs, rate=(1 - self.keep_prob))
return outputs
def call(self, projection, seq_length):
......@@ -178,14 +178,17 @@ class Encoder(tf.keras.layers.Layer):
layer(values_in, attention_in, maskr3, inverse_normalizer)
for layer in self.attention_pool_layers
]
assert tensors, "no ngram channels have been configured"
pre_logits = self.concat_quantizer(tensors)
return self.final_fc(pre_logits)
class Model(Encoder):
def __init__(self, config, mode):
super(Model, self).__init__(config, mode)
def __init__(self, config, mode, **kwargs):
super(Model, self).__init__(config, mode, **kwargs)
self.projection = projection_layers.ProjectionLayer(config, mode)
def call(self, inputs):
......
......@@ -93,8 +93,8 @@ py_binary(
# Expect numpy installed
# package TFLite flex delegate
# package TFLite interpreter
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:ngrams_op_resolver",
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:whitespace_tokenizer_op_resolver",
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:_pywrap_ngrams_op_resolver",
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:_pywrap_whitespace_tokenizer_op_resolver",
# Expect tensorflow text installed
],
)
......
......@@ -10,15 +10,6 @@ package(
],
)
py_library(
name = "text_projection",
srcs = ["text_projection.py"],
srcs_version = "PY3",
deps = [
":sequence_string_projection_op_py",
],
)
cc_library(
name = "sequence_string_projection_op",
srcs = [
......@@ -30,7 +21,6 @@ cc_library(
":projection_util",
":text_distorter",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/random",
"@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib",
],
......@@ -71,11 +61,9 @@ cc_library(
srcs = ["text_distorter.cc"],
hdrs = ["text_distorter.h"],
deps = [
"@com_google_absl//absl/strings",
"@icu4c",
"@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib",
"@utf_archive//:utf",
],
)
......@@ -102,7 +90,6 @@ cc_library(
"@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/random",
],
alwayslink = 1,
)
......
......@@ -79,7 +79,7 @@ std::string ContractToken(const char* input_ptr, size_t len, size_t num_chars) {
// Count how many times this pattern appeared.
int num_cur_patterns = 0;
if (cur_pattern.find(" ") == std::string::npos && !IsDigit(cur_pattern)) {
if (cur_pattern.find(' ') == std::string::npos && !IsDigit(cur_pattern)) {
num_cur_patterns =
GetNumPattern(char_tokens, i + num_chars, num_chars, cur_pattern);
}
......
......@@ -25,25 +25,28 @@ limitations under the License.
namespace {
constexpr int kInvalid = -1;
constexpr char kSpace = ' ';
} // namespace
// A HashEngine that uses MurmurHash to convert text to hashcodes.
class MurmurHash : public HashEngine {
public:
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes,
int feature_size) override {
std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
uint64_t hash_low = 0;
uint64_t hash_high = 0;
for (int i = 0; i < feature_size; i += 64) {
if (i == 0) {
auto hash = MurmurHash128(word.c_str(), word.size());
auto hash = MurmurHash128(word.data(), word.size());
hash_low = hash.first;
hash_high = hash.second;
} else {
GetMoreBits(hash_low, hash_high, &hash_low, &hash_high);
}
hash_codes->push_back(hash_low);
hash_codes->push_back(hash_high);
hash_codes.push_back(hash_low);
hash_codes.push_back(hash_high);
}
return hash_codes;
}
private:
......@@ -78,7 +81,7 @@ class MurmurHash : public HashEngine {
std::pair<uint64_t, uint64_t> MurmurHash128(const char* buf,
const size_t len) {
// Initialize the hashing value.
uint64_t hash = len * kMul;
uint64_t hash1 = len * kMul;
// hash2 will be xored by hash1 during the hash computation iterations.
// In the end we use an alternative mixture multiplier for mixing
// the bits in hash2.
......@@ -90,34 +93,38 @@ class MurmurHash : public HashEngine {
for (const char* p = buf; p != end; p += 8) {
// Manually unrolling this loop 2x did not help on Intel Core 2.
hash = MurmurStep(hash, Load64VariableLength(p, 8));
hash2 ^= hash;
hash1 = MurmurStep(hash1, Load64VariableLength(p, 8));
hash2 ^= hash1;
}
if ((len & 0x7) != 0) {
const uint64_t data = Load64VariableLength(end, len & 0x7);
hash ^= data;
hash *= kMul;
hash2 ^= hash;
hash1 ^= data;
hash1 *= kMul;
hash2 ^= hash1;
}
hash = ShiftMix(hash) * kMul;
hash2 ^= hash;
hash = ShiftMix(hash);
hash1 = ShiftMix(hash1) * kMul;
hash2 ^= hash1;
hash1 = ShiftMix(hash1);
// kMul2 is a prime just above the golden ratio. It is used to ensure that
// the impact of the last few bytes is different for the upper and lower
// 64 bits.
return std::make_pair(hash, hash2);
return {hash1, hash2};
}
};
// A HashEngine that uses a prefix and suffix preserving hash to convert text
// to hashcodes.
class XFixHash : public HashEngine {
public:
explicit XFixHash(int bits_per_char)
: bits_per_char_(bits_per_char), bit_mask_((1ULL << bits_per_char) - 1) {}
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes,
int feature_size) override {
std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
auto token_ptr = reinterpret_cast<const uint8_t*>(word.c_str());
size_t token_size = word.size();
int token_idx = 0;
......@@ -134,9 +141,10 @@ class XFixHash : public HashEngine {
hash_low = (hash_low << bits_per_char_) | (frhash & bit_mask_);
hash_high = (hash_high << bits_per_char_) | (brhash & bit_mask_);
}
hash_codes->push_back(hash_low);
hash_codes->push_back(hash_high);
hash_codes.push_back(hash_low);
hash_codes.push_back(hash_high);
}
return hash_codes;
}
private:
......@@ -146,6 +154,8 @@ class XFixHash : public HashEngine {
const uint64_t bit_mask_;
};
// A HashEngine that performs a position preserving unicode level hashing to
// convert text to hashcodes.
class UnicodeHash : public HashEngine {
public:
// bits_per_unicode should be a divisor of 64.
......@@ -154,8 +164,10 @@ class UnicodeHash : public HashEngine {
bit_mask_(((1ULL << bits_per_unicode) - 1) << (64 - bits_per_unicode)) {
}
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes,
int feature_size) override {
std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
auto word_ptr = word.c_str();
int utflength = utflen(const_cast<char*>(word_ptr));
// Both `feature_size` and `bits_per_unicode` are bit lengths.
......@@ -187,8 +199,9 @@ class UnicodeHash : public HashEngine {
hash = hash >> bits_per_unicode_;
}
}
hash_codes->push_back(hash);
hash_codes.push_back(hash);
}
return hash_codes;
}
private:
......@@ -197,6 +210,8 @@ class UnicodeHash : public HashEngine {
const uint64_t bit_mask_;
};
} // namespace
bool Hasher::SupportedHashType(const std::string& hash_type) {
std::unordered_set<std::string> supported({kMurmurHash, kUnicodeHash8,
kUnicodeHash16, kXfixHash8,
......@@ -225,7 +240,7 @@ Hasher* Hasher::CreateHasher(int feature_size, const std::string& hash_type) {
Hasher::Hasher(int feature_size, HashEngine* hash_engine)
: feature_size_(feature_size), hash_engine_(hash_engine) {
hash_engine_->GetHashCodes(empty_string_, &null_hash_codes_, feature_size_);
null_hash_codes_ = hash_engine_->GetHashCodes(empty_string_, feature_size_);
}
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
......
......@@ -21,23 +21,26 @@ limitations under the License.
#include "libutf/utf.h"
constexpr int kFirstCapOffset = 3;
constexpr int kAllCapsOffset = 4;
constexpr int kWordNoveltyOffset = 1;
constexpr int kDocSizeOffset = 2;
const char kMurmurHash[] = "murmur";
const char kXfixHash8[] = "xfixhash8";
const char kXfixHash16[] = "xfixhash16";
const char kXfixHash32[] = "xfixhash32";
const char kUnicodeHash8[] = "unicodehash8";
const char kUnicodeHash16[] = "unicodehash16";
inline constexpr int kFirstCapOffset = 3;
inline constexpr int kAllCapsOffset = 4;
inline constexpr int kWordNoveltyOffset = 1;
inline constexpr int kDocSizeOffset = 2;
inline constexpr char kMurmurHash[] = "murmur";
inline constexpr char kXfixHash8[] = "xfixhash8";
inline constexpr char kXfixHash16[] = "xfixhash16";
inline constexpr char kXfixHash32[] = "xfixhash32";
inline constexpr char kUnicodeHash8[] = "unicodehash8";
inline constexpr char kUnicodeHash16[] = "unicodehash16";
// A base class that specifies the interface for a hash engine used by the
// projection operator.
class HashEngine {
public:
virtual void GetHashCodes(const std::string& word,
std::vector<uint64_t>* hash_codes,
int feature_size) = 0;
// Takes a string token `word` and a `feature_size` (measured in bits) and
// returns hash codes that represent the token.
virtual std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) = 0;
virtual ~HashEngine() {}
};
......@@ -50,13 +53,12 @@ class Hasher {
const std::string& hash_type = kMurmurHash);
static bool SupportedHashType(const std::string& hash_type);
bool GetHashCodes(const std::string& word,
std::vector<uint64_t>* hash_codes) {
std::vector<uint64_t>& hash_codes) {
if (!hash_engine_) return false;
if (word.empty()) {
*hash_codes = null_hash_codes_;
hash_codes = null_hash_codes_;
} else {
hash_codes->clear();
hash_engine_->GetHashCodes(word, hash_codes, feature_size_);
hash_codes = hash_engine_->GetHashCodes(word, feature_size_);
}
return true;
}
......@@ -64,8 +66,13 @@ class Hasher {
private:
explicit Hasher(int feature_size, HashEngine* hash_engine);
const std::string empty_string_ = "<null>";
// Size of the projection feature, i.e. the number of bits of hash codes
// that will be generated by this class.
const int feature_size_;
// The hash engine used by this class.
std::unique_ptr<HashEngine> hash_engine_;
// Hash codes for empty text are precalculated and stored below to speed
// up projection.
std::vector<uint64_t> null_hash_codes_;
};
......@@ -90,7 +97,8 @@ class ProjectionUnicodeHandler {
}
// Performs language independent lower case and returns a string with
// supported unicode segments.
// supported unicode segments, plus two additional flags, first_cap and
// all_caps, which when true indicate that the text is Firstcap or ALLCAPS.
std::string LowerCaseUTF8WithSupportedUnicodes(
const std::pair<const char*, size_t>& source, bool* first_cap = nullptr,
bool* all_caps = nullptr) const;
......@@ -126,14 +134,19 @@ class ProjectionUnicodeHandler {
int max_tokens);
private:
// Parses and extracts supported unicode segments from a utf8 string.
// Parses and extracts supported or allowed unicode segments, also referred
// to as vocabulary, from a utf8 string.
void InitializeVocabulary(const std::string& vocabulary);
// A variable that maps a valid Unicode rune to its index in valid character
// vocabulary.
std::unordered_map<Rune, int> valid_chars_;
// Controls whether to exclude non-alphabetic, non-space characters from the
// output text.
bool exclude_nonalphaspace_unicodes_;
};
static constexpr size_t kEntireString = SIZE_MAX;
static constexpr size_t kAllTokens = SIZE_MAX;
inline constexpr size_t kEntireString = SIZE_MAX;
inline constexpr size_t kAllTokens = SIZE_MAX;
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
size_t max_input, size_t max_tokens);
......
......@@ -198,7 +198,7 @@ cc_library(
),
includes = ["tensorflow_includes"],
deps = [
"@eigen_archive//:eigen",
"@eigen_archive//:eigen3",
"@protobuf_archive//:includes",
"@zlib_includes//:includes",
"@snappy_includes//:includes",
......
......@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // seq_flow_lite
#include "tf_ops/projection_tokenizer_util.h" // seq_flow_lite
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tf_ops/text_distorter.h" // seq_flow_lite
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tf_ops/projection_normalizer_util.h" // seq_flow_lite
#include "tf_ops/projection_tokenizer_util.h" // seq_flow_lite
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tf_ops/text_distorter.h" // seq_flow_lite
using ::tensorflow::int32;
using ::tensorflow::int64;
......@@ -51,10 +51,11 @@ float* AllocateTensor(OpKernelContext* ctx, const std::string& tensor_name,
return &tensor->flat<float>()(0);
}
// OpKernel for the sequence string projection op.
class SequenceStringProjectionOp : public OpKernel {
public:
explicit SequenceStringProjectionOp(OpKernelConstruction* context)
: OpKernel(context) {
: OpKernel(context), philox_(171), generator_(&philox_) {
OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
std::string hashtype;
OP_REQUIRES_OK(context, context->GetAttr("hashtype", &hashtype));
......@@ -159,7 +160,10 @@ class SequenceStringProjectionOp : public OpKernel {
}
const int64 seq_len =
static_cast<int64>(bos_tag_ + words.size() + eos_tag_);
CHECK_GT(seq_len, 0);
CHECK_GT(seq_len, 0)
<< "Projection models expect input text to have at least one valid "
"token. If empty text is a valid input for your model, please set "
"add_bos_tag to true.";
max_seq_len = std::max(max_seq_len, seq_len);
words_batches.emplace_back(std::move(words));
}
......@@ -208,7 +212,7 @@ class SequenceStringProjectionOp : public OpKernel {
CHECK_EQ(eos_tag_, 1);
word = kEndTokenTSP;
}
hasher_->GetHashCodes(word, &hash_codes);
hasher_->GetHashCodes(word, hash_codes);
for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
auto hash = hash_codes[hindex];
for (int kmax = std::min(k + increment, feature_size_); k < kmax;) {
......@@ -229,7 +233,7 @@ class SequenceStringProjectionOp : public OpKernel {
doc_size_feature;
}
if (add_first_cap_feature_ > 0.0f) {
if (text_distorter_->BernouilleSample(add_first_cap_feature_)) {
if (generator_.RandFloat() <= add_first_cap_feature_) {
projection[offset0 + feature_size_ - kFirstCapOffset] =
first_cap ? 1.0 : -1.0;
} else {
......@@ -237,7 +241,7 @@ class SequenceStringProjectionOp : public OpKernel {
}
}
if (add_all_caps_feature_ > 0.0f) {
if (text_distorter_->BernouilleSample(add_all_caps_feature_)) {
if (generator_.RandFloat() <= add_all_caps_feature_) {
projection[offset0 + feature_size_ - kAllCapsOffset] =
all_caps ? 1.0 : -1.0;
} else {
......@@ -252,21 +256,49 @@ class SequenceStringProjectionOp : public OpKernel {
}
private:
// Objects used for random number generation.
tensorflow::random::PhiloxRandom philox_;
tensorflow::random::SimplePhilox generator_;
// Dimensionality of the ternary vector for each token in the text.
int32 feature_size_;
// An object used to hash tokens in the text.
std::unique_ptr<Hasher> hasher_;
// An object used for distorting text before projection.
std::unique_ptr<TextDistorter> text_distorter_;
// An object used for manipulating unicode in the text. It performs tasks such
// as retaining only whitelisted unicodes in the text tokens and lowercasing
// them.
std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
// An object used for normalizing tokens in the text. This performs tasks
// such as identifying repeated characters and replacing them with a single
// instance.
std::unique_ptr<ProjectionNormalizer> projection_normalizer_;
// Character whitelist used by the projection operator.
std::string vocabulary_;
// Size of the character whitelist.
int vocabulary_size_;
// Maximum number of splits allowed in the text. The number of tokens in the
// text after segmentation will be at most max_splits_ + 1.
int32 max_splits_;
// A flag that indicates how to segment text. When true, text is segmented
// on spaces; otherwise it is segmented on unicode boundaries.
bool split_on_space_;
// When true, an end of sentence token is included in the projection.
int eos_tag_;
// When true, a begin of sentence token is included in the projection.
int bos_tag_;
// Number of bits used to capture word novelty. See the tensorflow op
// documentation below for details.
int word_novelty_bits_;
// Number of levels used to capture document size. See the tensorflow op
// documentation below for details.
int doc_size_levels_;
// Distance between levels used for word novelty.
float word_novelty_offset_;
// Probability with which a boolean feature indicating first_cap text is added.
float add_first_cap_feature_;
// Probability with which a boolean feature indicating all_caps text is added.
float add_all_caps_feature_;
};
......
......@@ -40,6 +40,8 @@ constexpr char kEndTokenTSP[] = "<EOS>";
constexpr float kMappingTable[4] = {0, 1, -1, 0};
constexpr int kIncrement = 32;
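Editor's note on kMappingTable and kIncrement: each 64-bit hash code is consumed two bits at a time, and every bit pair indexes the table to produce a ternary feature in {-1, 0, 1}, so one code supplies up to 32 (= kIncrement) features. A sketch of that decoding in plain Python (bit order assumed low-to-high, for illustration only):

MAPPING = (0.0, 1.0, -1.0, 0.0)  # mirrors kMappingTable

def decode(hash_code, num_features=32):
    """Decode up to 32 ternary features from one 64-bit hash code."""
    features = []
    for _ in range(num_features):
        features.append(MAPPING[hash_code & 0b11])  # low two bits pick the value
        hash_code >>= 2
    return features

print(decode(0b_10_01_00_11, num_features=4))  # [0.0, 0.0, 1.0, -1.0]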
// Version 2 OpKernel for the sequence string projection op.
// Template T can be int32 or int64.
template <typename T>
class SequenceStringProjectionOpV2 : public OpKernel {
public:
......@@ -136,7 +138,7 @@ class SequenceStringProjectionOpV2 : public OpKernel {
} else {
word = kEndTokenTSP;
}
hasher_->GetHashCodes(word, &hash_codes);
hasher_->GetHashCodes(word, hash_codes);
for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
auto hash = hash_codes[hindex];
for (int kmax = std::min(k + kIncrement, feature_size_); k < kmax;) {
......@@ -153,13 +155,25 @@ class SequenceStringProjectionOpV2 : public OpKernel {
}
private:
// Dimensionality of the ternary vector for each token in the text.
int32 feature_size_;
// An object used to hash tokens in the text.
std::unique_ptr<Hasher> hasher_;
// An object used for distorting text before projection.
std::unique_ptr<TextDistorter> text_distorter_;
// An object used for manipulating unicode in the text. It performs tasks such
// as retaining only whitelisted unicodes in the text tokens and lowercasing
// them.
std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
// An object used for normalizing tokens in the text. This performs tasks
// such as identifying repeated characters and replacing them with a single
// instance.
std::unique_ptr<ProjectionNormalizer> projection_normalizer_;
// Character whitelist used by the projection operator.
std::string vocabulary_;
// When true, an end of sentence token is included in the projection.
int eos_tag_;
// When true, a begin of sentence token is included in the projection.
int bos_tag_;
};
......
......@@ -32,7 +32,6 @@ class TextDistorter {
assert(distortion_probability_ <= 1.0);
}
std::string DistortText(icu::UnicodeString* uword);
bool BernouilleSample(float p) { return (generator_.RandFloat() <= p); }
private:
tensorflow::random::PhiloxRandom philox_;
......
......@@ -20,30 +20,6 @@ limitations under the License.
using ::tensorflow::int32;
class PoolingOp : public tensorflow::OpKernel {
public:
explicit PoolingOp(tensorflow::OpKernelConstruction* context)
: tensorflow::OpKernel(context) {}
void Compute(tensorflow::OpKernelContext* ctx) override {}
};
REGISTER_KERNEL_BUILDER(Name("PoolingOp").Device(::tensorflow::DEVICE_CPU),
PoolingOp);
REGISTER_OP("PoolingOp")
.Input("multiplier: float32")
.Input("constant: float32")
.Input("forward: float32")
.Output("state: float32")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
c->set_output(0, c->input(0));
return tensorflow::Status::OK();
})
.Doc(R"doc(
Dummy pooling op.
)doc");
class ExpectedValueOp : public tensorflow::OpKernel {
public:
explicit ExpectedValueOp(tensorflow::OpKernelConstruction* context)
......@@ -93,33 +69,3 @@ REGISTER_OP("LayerNorm")
.Doc(R"doc(
Dummy layer norm op.
)doc");
class UniformCausalAttnOp : public tensorflow::OpKernel {
public:
explicit UniformCausalAttnOp(tensorflow::OpKernelConstruction* context)
: tensorflow::OpKernel(context) {}
void Compute(tensorflow::OpKernelContext* ctx) override {}
};
REGISTER_KERNEL_BUILDER(
Name("UniformCausalAttn").Device(::tensorflow::DEVICE_CPU),
UniformCausalAttnOp);
REGISTER_OP("UniformCausalAttn")
.Input("input: float32")
.Input("time_step: int32")
.Input("selected_beams: int32")
.Attr("feature_size: int")
.Attr("beam_size: int")
.Output("output: float32")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
auto batch_size = c->Dim(c->input(0), 0);
int32 feature_size;
TF_RETURN_IF_ERROR(c->GetAttr("feature_size", &feature_size));
c->set_output(0, c->MakeShape({batch_size, 1, feature_size}));
return tensorflow::Status::OK();
})
.Doc(R"doc(
Dummy uniform causal attn op.
)doc");
# TFLite ops for sequence string projection.
load("@org_tensorflow//tensorflow:tensorflow.bzl", "pybind_extension")
load("@org_tensorflow//tensorflow/lite:build_def.bzl", "tflite_copts")
licenses(["notice"])
......@@ -100,3 +101,16 @@ cc_test(
"@flatbuffers",
],
)
pybind_extension(
name = "registerer",
srcs = ["registerer.cc"],
module_name = "registerer",
deps = [
":expected_value",
":layer_norm",
":sequence_string_projection",
"@org_tensorflow//tensorflow/lite:framework",
"@pybind11",
],
)
......@@ -18,7 +18,7 @@ limitations under the License.
#include "tflite_ops/quantization_util.h" // seq_flow_lite
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -156,4 +156,4 @@ TfLiteRegistration* Register_EXPECTED_VALUE() {
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
......@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/register.h"
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -25,6 +25,6 @@ TfLiteRegistration* Register_EXPECTED_VALUE();
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TFLITE_OPS_EXPECTED_VALUE_H_
......@@ -17,10 +17,10 @@ limitations under the License.
#include <unordered_set>
#include <vector>
#include "tflite_ops/quantization_util.h" // seq_flow_lite
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tflite_ops/quantization_util.h" // seq_flow_lite
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -213,6 +213,102 @@ TfLiteStatus FlexibleLayerNorm(const TfLiteTensor* input, const float scale,
return kTfLiteOk;
}
/*
* Layer normalization is optimized as follows in integer arithmetic
*
* Algorithm
* *********
* Subscript i \in {1, ..., N}, Inputs q_i, Outputs oq_i.
*
* x_i = (q_i - input_zero_point) * input_scale
* mean = sum_i x_i / N
* var = sum_i (x_i * x_i / N) - mean * mean
* std = sqrt(var + tolerance)
* xn_i = (x_i - mean) / std
* y_i = xn_i * scale + offset
* o_i = round(y_i / output_scale + output_zero_point)
* oq_i = clamp(o_i, 0, 255)
*
* Optimizations
* *************
* Applying linear expansion
* x_i = q_i * input_scale - input_zero_point * input_scale
* or x_i = m * q_i + c
* mean = m * mean_q + c
* Variance is not affected by a constant shift to the input
* var = m^2 * var_q
* std = m * sqrt(var_q + tolerance)
* Expanding x_i, mean, std in the equation for xn_i
* xn_i = (m * q_i + c - m * mean_q - c) / (m * sqrt(var_q + tolerance))
* Simplifying
* xn_i = (q_i - mean_q) / sqrt(var_q + tolerance)
* Setting inv_std_q = 1 / sqrt(var_q + tolerance)
* xn_i = q_i * inv_std_q - mean_q * inv_std_q
* y_i = q_i * inv_std_q * scale - mean_q * inv_std_q * scale + offset
* o_i = round(q_i * inv_std_q * scale / output_scale
* - mean_q * inv_std_q * scale / output_scale
* + offset / output_scale
* + output_zero_point)
* Setting
* static_bias = offset / output_scale + output_zero_point
* static_scale = scale / output_scale
* o_i = round(q_i * inv_std_q * static_scale
* - mean_q * inv_std_q * static_scale
* + static_bias)
* Setting
* dynamic_scale = inv_std_q * static_scale
* dynamic_bias = static_bias - mean_q * dynamic_scale
* o_i = round(q_i * dynamic_scale + dynamic_bias)
* oq_i = clamp(round(q_i * dynamic_scale + dynamic_bias), 0, 255)
*
* This results in the optimized implementation below. The strategy is to
* first compute first and second order summary statistics for q_i in a loop,
* then compute mean_q, var_q and finally dynamic_scale/dynamic_bias. This
* allows oq_i to be computed quickly in a tight loop.
*/
TfLiteStatus IntegerLayerNorm(const TfLiteTensor* input, const float scale,
const float offset, TfLiteTensor* output) {
const int input_rank = input->dims->size;
const int num_features = input->dims->data[input_rank - 1];
const int time_steps =
static_cast<int>(GetNumberOfSteps(input) / num_features);
const float out_inverse_scale = 1.0f / output->params.scale;
const float static_scale = scale * out_inverse_scale;
const float static_bias = static_cast<float>(output->params.zero_point) +
offset * out_inverse_scale;
const float inverse_num_features = 1.0f / num_features;
const uint8_t* const in_ptr = input->data.uint8;
uint8_t* out_ptr = output->data.uint8;
for (int i = 0; i < time_steps; ++i) {
int32_t i32_sum_q = 0;
int32_t i32_sum_qq = 0;
const int32_t index = i * num_features;
for (int j = index; j < index + num_features; ++j) {
const int32_t q_i = static_cast<int32_t>(in_ptr[j]);
// Compute first and second order statistics for qi.
i32_sum_q += q_i;
i32_sum_qq += q_i * q_i;
}
const float second_moment_qq = i32_sum_qq * inverse_num_features;
const float mean_q = i32_sum_q * inverse_num_features;
const float var_q = second_moment_qq - mean_q * mean_q;
const float inv_std_q = 1.0f / sqrt(var_q + 1e-6);
const float dynamic_scale = inv_std_q * static_scale;
const float dynamic_bias = static_bias - mean_q * dynamic_scale;
for (int j = index; j < index + num_features; ++j) {
const int32_t invalue = static_cast<int32_t>(in_ptr[j]);
const float value = invalue * dynamic_scale + dynamic_bias;
// Use an offset cast to perform float rounding.
const int32_t i32value =
static_cast<int32_t>(value + ((value >= 0.0) ? 0.5f : -0.5f));
// Clamp the result.
out_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, i32value), 0));
}
}
return kTfLiteOk;
}
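Editor's note: as a sanity check on the algebra in the comment above, here is a small numpy sketch (quantization parameters are hypothetical, chosen only for illustration) comparing the dynamic_scale/dynamic_bias path against a direct dequantize-normalize-requantize reference:

import numpy as np

np.random.seed(0)
in_scale, in_zp = 0.05, 128      # input:  x = (q - in_zp) * in_scale
out_scale, out_zp = 0.04, 128    # output: oq = round(y / out_scale + out_zp)
scale, offset, tol = 1.0, 0.0, 1e-6

q = np.random.randint(0, 256, size=64).astype(np.float64)

# Reference path: dequantize, layer-normalize in float, requantize.
x = (q - in_zp) * in_scale
y = (x - x.mean()) / np.sqrt(x.var() + tol) * scale + offset
ref = np.clip(np.round(y / out_scale + out_zp), 0, 255)

# Optimized path: statistics computed on the raw quantized values only.
mean_q = q.mean()
var_q = (q * q).mean() - mean_q * mean_q
dynamic_scale = (1.0 / np.sqrt(var_q + tol)) * scale / out_scale
dynamic_bias = offset / out_scale + out_zp - mean_q * dynamic_scale
opt = np.clip(np.round(q * dynamic_scale + dynamic_bias), 0, 255)

# The two paths agree up to one quantization step (the tolerance term is
# added in different domains, which perturbs std by a negligible amount).
assert np.max(np.abs(ref - opt)) <= 1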
TfLiteStatus DefaultLayerNormFloat(const TfLiteTensor* input, const float scale,
const float offset, TfLiteTensor* output) {
const int input_rank = input->dims->size;
......@@ -298,7 +394,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
if (num_axis == 1 && (axis->data.i32[0] == -1 ||
axis->data.i32[0] == (input->dims->size - 1))) {
if (input->type == kTfLiteUInt8) {
return DefaultLayerNorm(input, scale, offset, output);
return IntegerLayerNorm(input, scale, offset, output);
} else if (input->type == kTfLiteFloat32) {
return DefaultLayerNormFloat(input, scale, offset, output);
} else {
......@@ -328,4 +424,4 @@ TfLiteRegistration* Register_LAYER_NORM() {
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
......@@ -17,7 +17,7 @@ limitations under the License.
#include "tensorflow/lite/kernels/register.h"
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -25,6 +25,6 @@ TfLiteRegistration* Register_LAYER_NORM();
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
#endif // LEARNING_EXPANDER_POD_DEEP_POD_TFLITE_HANDLERS_LAYER_NORM_H_
......@@ -20,40 +20,35 @@ limitations under the License.
#include "flatbuffers/flexbuffers.h" // flatbuffer
#include "tensorflow/lite/kernels/test_util.h"
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
namespace {
class LayerNormModel : public SingleOpModel {
using ::testing::ElementsAreArray;
using ::tflite::ArrayFloatNear;
using ::tflite::Dequantize;
using ::tflite::TensorType_INT32;
using ::tflite::TensorType_UINT8;
class LayerNormModel : public ::tflite::SingleOpModel {
public:
explicit LayerNormModel(const TensorData& input, float output_min,
explicit LayerNormModel(std::initializer_list<int> input_shape,
float input_min, float input_max, float output_min,
float output_max, float scale, float offset,
std::initializer_list<int> axis_shape,
std::initializer_list<int> axis)
std::initializer_list<int> axes)
: scale_value_(scale), offset_value_(offset) {
input_ = AddInput(input);
const int num_axes = axes.size();
input_ = AddInput({TensorType_UINT8, input_shape, input_min, input_max});
scale_ = AddInput(
{TensorType_UINT8, {1}, std::min(scale, 0.0f), std::max(scale, 0.0f)});
offset_ = AddInput({TensorType_UINT8,
{1},
std::min(offset, 0.0f),
std::max(offset, 0.0f)});
axis_ = AddConstInput(TensorType_INT32, axis, axis_shape);
axis_ = AddConstInput(TensorType_INT32, axes, {num_axes});
output_ = AddOutput({TensorType_UINT8, {}, output_min, output_max});
flexbuffers::Builder fbb;
fbb.Map([&] {
{
size_t start = fbb.StartVector("axes");
for (const int& aval : axis) {
fbb.Int(aval);
}
fbb.EndVector(start, /*typed=*/true, /*fixed=*/false);
}
});
fbb.Finish();
SetCustomOp("LayerNorm", fbb.GetBuffer(), Register_LAYER_NORM);
SetCustomOp("LayerNorm", {}, Register_LAYER_NORM);
BuildInterpreter({GetShape(input_)});
}
......@@ -88,8 +83,9 @@ TEST(LayerNormModelTest, RegularInput) {
const std::vector<float> expected_output = {0.0, -1.6, 0.53, 1.07,
0.0, -1.13, 1.59, -0.45};
LayerNormModel m({TensorType_UINT8, {1, 2, 4}, -10, 10}, -10, 10, 1.0, 0.0,
{1}, {2});
LayerNormModel m(/*input_shape=*/{1, 2, 4}, /*input_min=*/-10,
/*input_max=*/10, /*output_min=*/-10, /*output_max=*/10,
/*scale=*/1.0, /*offset=*/0.0, /*axes=*/{2});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -106,8 +102,9 @@ TEST(LayerNormModelTest, NegativeScale) {
// Standard deviation values are 3.74, 4.41
const std::vector<float> expected_output = {0.0, 1.6, -0.53, -1.07,
0.0, 1.13, -1.59, 0.45};
LayerNormModel m({TensorType_UINT8, {1, 2, 4}, -10, 10}, -10, 10, -1.0, 0.0,
{1}, {2});
LayerNormModel m(/*input_shape=*/{1, 2, 4}, /*input_min=*/-10,
/*input_max=*/10, /*output_min=*/-10, /*output_max=*/10,
/*scale=*/-1.0, /*offset=*/0.0, /*axes=*/{2});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -124,8 +121,9 @@ TEST(LayerNormModelTest, NegativeOffset) {
// Standard deviation values are 3.74, 4.41
const std::vector<float> expected_output = {-1.0, -2.6, -0.53, 0.07,
-1.0, -2.13, 0.59, -1.45};
LayerNormModel m({TensorType_UINT8, {1, 2, 4}, -10, 10}, -10, 10, 1.0, -1.0,
{1}, {2});
LayerNormModel m(/*input_shape=*/{1, 2, 4}, /*input_min=*/-10,
/*input_max=*/10, /*output_min=*/-10, /*output_max=*/10,
/*scale=*/1.0, /*offset=*/-1.0, /*axes=*/{2});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -142,8 +140,9 @@ TEST(LayerNormModelTest, NegativeScaleAndOffset) {
// Standard deviation values are 3.74, 4.41
const std::vector<float> expected_output = {-1.0, 0.6, -1.53, -2.07,
-1.0, 0.13, -2.59, -0.55};
LayerNormModel m({TensorType_UINT8, {1, 2, 4}, -10, 10}, -10, 10, -1.0, -1.0,
{1}, {2});
LayerNormModel m(/*input_shape=*/{1, 2, 4}, /*input_min=*/-10,
/*input_max=*/10, /*output_min=*/-10, /*output_max=*/10,
/*scale=*/-1.0, /*offset=*/-1.0, /*axes=*/{2});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -160,8 +159,9 @@ TEST(LayerNormModelTest, MultipleAxis) {
1.12, -2.08, 0.48, -0.16, -0.95, -1.46, -0.95, 0.06,
-0.69, -0.23, -1.60, -1.15, -0.80, -0.16, 0.48, 1.12};
LayerNormModel m({TensorType_UINT8, {1, 2, 3, 4}, -3, 3}, -3, 3, 1.0, 0.0,
{2}, {1, 3});
LayerNormModel m(/*input_shape=*/{1, 2, 3, 4}, /*input_min=*/-3,
/*input_max=*/3, /*output_min=*/-3, /*output_max=*/3,
/*scale=*/1.0, /*offset=*/0.0, /*axes=*/{1, 3});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -178,8 +178,9 @@ TEST(LayerNormModelTest, MultipleNegativeAxis) {
1.12, -2.08, 0.48, -0.16, -0.95, -1.46, -0.95, 0.06,
-0.69, -0.23, -1.60, -1.15, -0.80, -0.16, 0.48, 1.12};
LayerNormModel m({TensorType_UINT8, {1, 2, 3, 4}, -3, 3}, -3, 3, 1.0, 0.0,
{2}, {-3, -1});
LayerNormModel m(/*input_shape=*/{1, 2, 3, 4}, /*input_min=*/-3,
/*input_max=*/3, /*output_min=*/-3, /*output_max=*/3,
/*scale=*/1.0, /*offset=*/0.0, /*axes=*/{-3, -1});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -199,8 +200,9 @@ TEST(LayerNormModelTest, MultipleAxisWithLargeDepth) {
2.05, 2.05, -0.67, -0.28, 1.27, 1.27, -1.06, -1.06, -0.28,
0., -0.85, -0.42, 0., 0.42, -0.85, -0.42, 0., 0.42};
LayerNormModel m({TensorType_UINT8, {1, 2, 2, 9}, -1.0, 1.0}, -3.0, 3.0, 1.0,
0.0, {2}, {1, 3});
LayerNormModel m(/*input_shape=*/{1, 2, 2, 9}, /*input_min=*/-1.0,
/*input_max=*/1.0, /*output_min=*/-3.0, /*output_max=*/3.0,
/*scale=*/1.0, /*offset=*/0.0, /*axes=*/{1, 3});
m.SetInput(input);
m.Invoke();
EXPECT_THAT(
......@@ -211,4 +213,4 @@ TEST(LayerNormModelTest, MultipleAxisWithLargeDepth) {
} // namespace
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
......@@ -20,7 +20,7 @@ limitations under the License.
#include "tensorflow/lite/context.h"
namespace tflite {
namespace seq_flow_lite {
// Returns the original (dequantized) value of an 8-bit quantized value.
inline float PodDequantizeValue(const TfLiteTensor& tensor, uint8_t value) {
......@@ -48,6 +48,6 @@ inline uint8_t PodQuantize(float value, int32_t zero_point,
return static_cast<uint8_t>(std::max(std::min(255, integer_value), 0));
}
} // namespace tflite
} // namespace seq_flow_lite
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TFLITE_OPS_QUANTIZATION_UTIL_H_
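Editor's note: the PodQuantize/PodDequantize helpers above implement the standard affine uint8 scheme. A minimal numpy restatement of the round-trip (scale and zero point are illustrative values, not taken from any model):

import numpy as np

scale, zero_point = 0.05, 128

def pod_quantize(value):
    # Mirror of PodQuantize: round-to-nearest via an offset cast, then clamp.
    i = int(value / scale + (0.5 if value >= 0 else -0.5)) + zero_point
    return int(np.clip(i, 0, 255))

def pod_dequantize(q):
    # Mirror of PodDequantizeValue: scale * (value - zero_point).
    return scale * (q - zero_point)

q = pod_quantize(1.337)
print(q, pod_dequantize(q))  # 155, ~1.35; quantization error is at most scale / 2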
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "pybind11/pybind11.h"
#include "pybind11/pytypes.h"
#include "tensorflow/lite/mutable_op_resolver.h"
#include "tflite_ops/expected_value.h" // seq_flow_lite
#include "tflite_ops/layer_norm.h" // seq_flow_lite
#include "tflite_ops/sequence_string_projection.h" // seq_flow_lite
PYBIND11_MODULE(registerer, m) {
m.doc() =
"Module that provides a registerer from the seq flow lite custom ops";
m.def(
"RegisterCustomOps",
[](uintptr_t ptr) {
::tflite::MutableOpResolver* resolver =
reinterpret_cast<::tflite::MutableOpResolver*>(ptr);
resolver->AddCustom(
"ExpectedValueOp",
::seq_flow_lite::ops::custom::Register_EXPECTED_VALUE());
resolver->AddCustom(
"LayerNorm", ::seq_flow_lite::ops::custom::Register_LAYER_NORM());
resolver->AddCustom("SEQUENCE_STRING_PROJECTION",
::seq_flow_lite::ops::custom::
Register_SEQUENCE_STRING_PROJECTION());
resolver->AddCustom("SequenceStringProjection",
::seq_flow_lite::ops::custom::
Register_SEQUENCE_STRING_PROJECTION());
resolver->AddCustom("SEQUENCE_STRING_PROJECTION_V2",
::seq_flow_lite::ops::custom::
Register_SEQUENCE_STRING_PROJECTION_V2());
resolver->AddCustom("SequenceStringProjectionV2",
::seq_flow_lite::ops::custom::
Register_SEQUENCE_STRING_PROJECTION_V2());
},
"Register custom ops used by seq flow lite layers");
}
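Editor's note: a sketch of how this extension might be used from Python, assuming TF's InterpreterWithCustomOps wrapper (which invokes each registerer callable with the address of its MutableOpResolver); the model path is a placeholder:

from tensorflow.lite.python import interpreter as interpreter_lib

from tflite_ops import registerer  # import seq_flow_lite module

# The wrapper calls RegisterCustomOps with a pointer to its resolver, which
# the pybind function above casts back to a MutableOpResolver*.
interpreter = interpreter_lib.InterpreterWithCustomOps(
    model_path="tflite.fb",  # placeholder path
    custom_op_registerers=[registerer.RegisterCustomOps])
interpreter.allocate_tensors()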
......@@ -31,7 +31,7 @@ limitations under the License.
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tflite_ops/quantization_util.h" // seq_flow_lite
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -163,7 +163,7 @@ class ProjectionParams {
DocSizeFeature(&doc_size_feature, num_tokens);
*data = PodQuantize(doc_size_feature, 127.0f, 127);
}
void Hash(const std::string& word, std::vector<uint64_t>* hash_codes) {
void Hash(const std::string& word, std::vector<uint64_t>& hash_codes) {
hasher_->GetHashCodes(word, hash_codes);
}
// Lower cases the input text and eliminates all unsupported
......@@ -269,6 +269,8 @@ class ProjectionParamsV2 : public ProjectionParams {
num_tokens, dims->data[1]);
return kTfLiteError;
}
tokens_.clear();
tokens_.reserve(num_tokens);
for (int i = 0; i < num_tokens; ++i) {
const tflite::StringRef strref = tflite::GetString(input_t, i);
tokens_.push_back(std::pair<const char*, size_t>(strref.str, strref.len));
......@@ -412,7 +414,7 @@ void TypedEval(const T* mapping_table, ProjectionParams* params, T* data) {
} else {
word = kEndToken;
}
params->Hash(word, &hash_codes);
params->Hash(word, hash_codes);
for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
auto hash = hash_codes[hindex];
for (int kmax = std::min(k + kIncrement, params->FeatureSize());
......@@ -505,4 +507,4 @@ TfLiteRegistration* Register_SEQUENCE_STRING_PROJECTION_V2() {
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
......@@ -16,7 +16,7 @@ limitations under the License.
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TFLITE_OPS_SEQUENCE_STRING_PROJECTION_H_
#include "tensorflow/lite/kernels/register.h"
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
......@@ -29,6 +29,6 @@ extern const char kSequenceStringProjectionV2[];
TfLiteRegistration* Register_SEQUENCE_STRING_PROJECTION_V2();
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TFLITE_OPS_SEQUENCE_STRING_PROJECTION_H_
......@@ -25,29 +25,32 @@ limitations under the License.
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tflite_ops/tf_tflite_diff_test_util.h" // seq_flow_lite
namespace tflite {
namespace seq_flow_lite {
namespace ops {
namespace custom {
namespace {
using ::seq_flow_lite::testing::AttrValue;
using ::seq_flow_lite::testing::FloatTensor;
using ::seq_flow_lite::testing::IntTensor;
using ::seq_flow_lite::testing::OpEquivTestCase;
using ::seq_flow_lite::testing::StringTensor;
using ::seq_flow_lite::testing::TensorflowTfLiteOpTest;
using ::testing::ElementsAreArray;
using ::tflite::testing::AttrValue;
using ::tflite::testing::FloatTensor;
using ::tflite::testing::IntTensor;
using ::tflite::testing::OpEquivTestCase;
using ::tflite::testing::StringTensor;
using ::tflite::testing::TensorflowTfLiteOpTest;
class SequenceStringProjectionModel : public SingleOpModel {
using ::tflite::TensorType_FLOAT32;
using ::tflite::TensorType_STRING;
using ::tflite::TensorType_UINT8;
class SequenceStringProjectionModel : public ::tflite::SingleOpModel {
public:
explicit SequenceStringProjectionModel(
bool split_on_space, int max_splits, int word_novelty_bits,
int doc_size_levels, bool add_eos_tag, TensorType output_type,
int doc_size_levels, bool add_eos_tag, ::tflite::TensorType output_type,
const std::string& token_separators = "",
bool normalize_repetition = false, float add_first_cap = 0.0,
float add_all_caps = 0.0, const string& hashtype = kMurmurHash) {
float add_all_caps = 0.0, const std::string& hashtype = kMurmurHash) {
flexbuffers::Builder fbb;
fbb.Map([&] {
fbb.Int("feature_size", 4);
......@@ -798,11 +801,11 @@ INSTANTIATE_TEST_SUITE_P(
SequenceStringProjectionTests, SequenceStringProjectionTest,
::testing::ValuesIn(SequenceStringProjectionTestCases()));
class SequenceStringProjectionV2Model : public SingleOpModel {
class SequenceStringProjectionV2Model : public ::tflite::SingleOpModel {
public:
explicit SequenceStringProjectionV2Model(
std::vector<std::vector<int>> input_shapes,
const string& hashtype = kMurmurHash) {
const std::string& hashtype = kMurmurHash) {
flexbuffers::Builder fbb;
fbb.Map([&] {
fbb.Int("feature_size", 4);
......@@ -827,6 +830,7 @@ class SequenceStringProjectionV2Model : public SingleOpModel {
<< "Cannot allocate tensors";
return SingleOpModel::InvokeUnchecked();
}
std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
private:
int input_;
......@@ -884,6 +888,15 @@ TEST(SequenceStringProjectionV2Test, RegularInputUint8) {
m.Invoke({"hello", "world"}, kTfLiteOk);
}
TEST(SequenceStringProjectionV2Test, NumberProjectionsForMultipleInputs) {
SequenceStringProjectionV2Model m({{1, 2}});
std::vector<std::string> input = {"hello", "world"};
m.Invoke(input, kTfLiteOk);
EXPECT_EQ(m.GetOutputShape()[1], input.size());
m.Invoke(input, kTfLiteOk);
EXPECT_EQ(m.GetOutputShape()[1], input.size());
}
class SequenceStringProjectionV2Test : public TensorflowTfLiteOpTest {
std::function<TfLiteRegistration*()> TfLiteOpRegistration() override {
return ops::custom::Register_SEQUENCE_STRING_PROJECTION_V2;
......@@ -986,7 +999,7 @@ INSTANTIATE_TEST_SUITE_P(
} // namespace
} // namespace custom
} // namespace ops
} // namespace tflite
} // namespace seq_flow_lite
int main(int argc, char** argv) {
// On Linux, add: absl::SetFlag(&FLAGS_logtostderr, true);
......
......@@ -19,11 +19,16 @@ limitations under the License.
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/lib/core/status_test_util.h"
namespace tflite {
namespace seq_flow_lite {
namespace testing {
using ::tensorflow::TensorProto;
using ::testing::FloatNear;
using ::tflite::TensorType_BOOL;
using ::tflite::TensorType_FLOAT32;
using ::tflite::TensorType_INT32;
using ::tflite::TensorType_STRING;
using ::tflite::TensorType_UINT8;
::tflite::TensorType TfTypeToTfLiteType(::tensorflow::DataType dtype) {
switch (dtype) {
......@@ -324,7 +329,7 @@ void TensorflowTfLiteOpTest::CompareOpOutput() {
const auto& quantization_params =
GetParam().output_tensors[i].quantization_params;
if (quantization_params.scale != 0.0) {
auto tflite_output_values = Dequantize(
auto tflite_output_values = tflite::Dequantize(
tflite_op_.ExtractVector<uint8_t>(tflite_outputs_[i]),
quantization_params.scale, quantization_params.zero_point);
for (int i = 0; i < tf_output_values.size(); i++) {
......
......@@ -25,7 +25,7 @@ limitations under the License.
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/lite/kernels/test_util.h"
namespace tflite {
namespace seq_flow_lite {
namespace testing {
// Convenience constructors.
......@@ -144,6 +144,6 @@ class TensorflowTfLiteOpTest
};
} // namespace testing
} // namespace tflite
} // namespace seq_flow_lite
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TFLITE_OPS_TF_TFLITE_DIFF_TEST_UTIL_H_
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Binary to train PRADO model with TF 2.0."""
import importlib
import json
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import input_fn_reader # import root module
FLAGS = flags.FLAGS
flags.DEFINE_string("config_path", None, "Path to a RunnerConfig.")
flags.DEFINE_enum("runner_mode", "train", ["train", "train_and_eval", "eval"],
"Runner mode.")
flags.DEFINE_string("master", None, "TensorFlow master URL.")
flags.DEFINE_string(
"output_dir", "/tmp/testV2",
"The output directory where the model checkpoints will be written.")
flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
flags.DEFINE_integer(
"num_tpu_cores", 8,
"Only used if `use_tpu` is True. Total number of TPU cores to use.")
def load_runner_config():
with tf.io.gfile.GFile(FLAGS.config_path, "r") as f:
return json.loads(f.read())
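For reference, a hypothetical RunnerConfig showing only the keys this script reads: "name" (the importable module defining Encoder), "batch_size", and "model_config", whose "multilabel" flag selects the loss below. Real configs carry additional model-specific fields:

{
  "name": "prado",
  "batch_size": 32,
  "model_config": {
    "multilabel": false
  }
}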
def compute_loss(logits, labels, model_config, mode):
"""Creates a sequence labeling model."""
if mode != tf.estimator.ModeKeys.PREDICT:
if not model_config["multilabel"]:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits)
else:
loss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=labels, logits=logits)
loss = tf.reduce_mean(loss)
else:
loss = None
return loss
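A shape sketch of the two branches (the shapes follow from the TF loss ops, not from this change): the single-label path takes integer class ids, while the multilabel path takes float multi-hot targets shaped like the logits:

logits = tf.random.normal([8, 5])  # batch of 8, 5 classes
single = compute_loss(logits, tf.zeros([8], tf.int32),
                      {"multilabel": False}, tf.estimator.ModeKeys.TRAIN)
multi = compute_loss(logits, tf.zeros([8, 5], tf.float32),
                     {"multilabel": True}, tf.estimator.ModeKeys.TRAIN)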
def model_fn_builder(runner_config, mode):
"""Returns `model_fn` closure for TPUEstimator."""
rel_module_path = "" # empty base dir
model = importlib.import_module(rel_module_path + runner_config["name"])
model_config = runner_config["model_config"]
return model.Encoder(model_config, mode)
def main(_):
runner_config = load_runner_config()
if FLAGS.output_dir:
tf.io.gfile.makedirs(FLAGS.output_dir)
train_model = model_fn_builder(runner_config, tf.estimator.ModeKeys.TRAIN)
optimizer = tf.keras.optimizers.Adam()
train_input_fn = input_fn_reader.create_input_fn(
runner_config=runner_config,
mode=tf.estimator.ModeKeys.TRAIN,
drop_remainder=True)
params = {"batch_size": runner_config["batch_size"]}
train_ds = train_input_fn(params)
train_loss = tf.keras.metrics.Mean(name="train_loss")
@tf.function
def train_step(features):
with tf.GradientTape() as tape:
logits = train_model(features["projection"], features["seq_length"])
loss = compute_loss(logits, features["label"],
runner_config["model_config"],
tf.estimator.ModeKeys.TRAIN)
gradients = tape.gradient(loss, train_model.trainable_variables)
optimizer.apply_gradients(zip(gradients, train_model.trainable_variables))
train_loss(loss)
for epoch in range(1):
train_loss.reset_states()
for features in train_ds:
train_step(features)
step = optimizer.iterations.numpy()
if step % 100 == 0:
logging.info("Running step %s in epoch %s", step, epoch)
logging.info("Training loss: %s, epoch: %s, step: %s",
round(train_loss.result().numpy(), 4), epoch, step)
if __name__ == "__main__":
app.run(main)
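A typical invocation (the script name and paths are placeholders; in this revision only the training loop is wired up, so runner_mode values other than "train" have no effect):

python trainer.py \
  --config_path=/path/to/runner_config.json \
  --runner_mode=train \
  --output_dir=/tmp/prado_v2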
......@@ -29,13 +29,49 @@ def _dump_graph_in_text_format(filename, graph_def):
class InterpreterWithCustomOps(tf.lite.Interpreter):
"""Extended tf.lite.Interpreter."""
def __init__(self, model_content, custom_op_registerers):
self._custom_op_registerers = custom_op_registerers
def __init__(self, model_content, custom_op_registerers=None):
self._custom_op_registerers = custom_op_registerers or []
super(InterpreterWithCustomOps, self).__init__(model_content=model_content)
def op_details(self):
op_details = []
try:
op_details = self._get_ops_details() # Accessing experimental method.
except AttributeError:
print('Unable to access op details')
return op_details
def set_output_quantized_for_custom_ops(graph_def):
def op_histogram(self):
op_hist = {}
op_list = self.op_details()
for op in op_list:
if op['op_name'] in op_hist:
op_hist[op['op_name']] += 1
else:
op_hist[op['op_name']] = 1
return op_hist
def check_op_histogram(self, expected):
passed = True
for k, v in self.op_histogram().items():
if k not in expected:
print('Unexpected key {} found {} times.'.format(k, v))
passed = False
continue
elif expected[k] != v:
print('Expected {} counts of key {} found {}.'.format(
expected[k], k, v))
passed = False
del expected[k]
for k, v in expected.items():
print('Missing expected key {} value {}.'.format(k, v))
passed = False
return passed
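A sketch of how the histogram check might be used in a test (the model bytes and expected counts are illustrative, and `registerer` is the hypothetical pybind module from earlier). Note that check_op_histogram consumes its argument (matched keys are deleted from `expected`), so pass a fresh dict on each call:

interpreter = InterpreterWithCustomOps(
    model_content=tflite_model,
    custom_op_registerers=[registerer.RegisterCustomOps])
assert interpreter.check_op_histogram({
    'SEQUENCE_STRING_PROJECTION': 1,
    'EXPECTED_VALUE': 1,
})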
def set_output_quantized_for_custom_ops(graph_def, use_mlir=True):
"""Set output types/quantized flag for custom/unsupported ops."""
quantized_custom_ops = {
'SequenceStringProjection': [tf.float32.as_datatype_enum],
......@@ -44,6 +80,8 @@ def set_output_quantized_for_custom_ops(graph_def):
'ExpectedValueOp': [tf.float32.as_datatype_enum],
'LayerNorm': [tf.float32.as_datatype_enum],
'UniformCausalAttn': [tf.float32.as_datatype_enum],
'RnnDecoderReadState': [tf.float32.as_datatype_enum],
'RnnDecoderWriteState': [tf.float32.as_datatype_enum],
}
custom_op_renames = {
'SequenceStringProjection': 'SEQUENCE_STRING_PROJECTION',
......@@ -52,30 +90,27 @@ def set_output_quantized_for_custom_ops(graph_def):
for node in graph_def.node:
if node.op in quantized_custom_ops:
node.attr['_output_quantized'].b = True
node.attr['_output_types'].list.type[:] = quantized_custom_ops[node.op]
if node.op in custom_op_renames:
if use_mlir:
node.attr['_tfl_quant_trait'].s = str.encode('fully_quantizable')
else:
node.attr['_output_quantized'].b = True
node.attr['_output_types'].list.type[:] = quantized_custom_ops[node.op]
if not use_mlir and node.op in custom_op_renames:
node.op = custom_op_renames[node.op]
def generate_tflite(session, graph, input_tensors, output_tensors):
def generate_tflite(session,
graph,
input_tensors,
output_tensors,
use_mlir=True):
"""Generate TFLite model from a session, graph and input/output tensors."""
output_nodes = [tensor.name.split(':')[0] for tensor in output_tensors]
graph_def = tf.graph_util.convert_variables_to_constants(
session, graph.as_graph_def(), output_nodes)
set_output_quantized_for_custom_ops(graph_def)
# TODO(b/171063452): Bug needs to be fixed to handle this correctly.
# def _node_name(tensor):
# return tensor.name.split(':')[0]
set_output_quantized_for_custom_ops(graph_def, use_mlir)
# input_arrays_with_shape = [
# (_node_name(tensor), None) for tensor in input_tensors
# ]
# output_arrays = [_node_name(tensor) for tensor in output_tensors]
# converter = tf.lite.TFLiteConverter(graph_def, None, None,
# input_arrays_with_shape, output_arrays)
converter = tf.lite.TFLiteConverter(graph_def, input_tensors, output_tensors)
converter.inference_type = tf.uint8
converter.default_ranges_stats = (127.5, 127.5)
......@@ -83,5 +118,5 @@ def generate_tflite(session, graph, input_tensors, output_tensors):
tensor.op.name: (127.5, 127.5) for tensor in input_tensors
}
converter.allow_custom_ops = True
converter.experimental_new_converter = False
converter.experimental_new_converter = use_mlir
return converter.convert()
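A minimal end-to-end sketch in TF1-style graph mode (the toy graph is illustrative; any graph whose input and output tensors carry fake-quant ranges will do):

graph = tf.Graph()
with graph.as_default():
  inp = tf.compat.v1.placeholder(tf.float32, [1, 4], name='input')
  out = tf.quantization.fake_quant_with_min_max_args(inp, -1.0, 1.0)
with tf.compat.v1.Session(graph=graph) as session:
  session.run(tf.compat.v1.global_variables_initializer())
  tflite_model = generate_tflite(session, graph, [inp], [out])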