diff --git a/README.md b/README.md
index adf3602d892a4fa3cf2c30ccad35f4caccde4c1a..2327f8e92cb28be5807dd868982e185c4ddbdb9e 100644
--- a/README.md
+++ b/README.md
@@ -39,9 +39,10 @@ People who are a little bit adventurous can also try our nightly binaries:
 * [Android](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/lastSuccessfulBuild/artifact/bazel-out/local_linux/bin/tensorflow/examples/android/tensorflow_demo.apk) ([build history](http://ci.tensorflow.org/view/Nightly/job/nightly-matrix-android/TF_BUILD_CONTAINER_TYPE=ANDROID,TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=NO_PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=android-slave/))

 #### *Try your first TensorFlow program*
-```python
+```shell
 $ python
-
+```
+```python
 >>> import tensorflow as tf
 >>> hello = tf.constant('Hello, TensorFlow!')
 >>> sess = tf.Session()
diff --git a/tensorflow/contrib/cmake/tf_core_cpu.cmake b/tensorflow/contrib/cmake/tf_core_cpu.cmake
index 374096e94252391a2c93914cacef3e85fc99050a..135c0015366a1c6da11c5f5a94e98f9dbcb22c8f 100644
--- a/tensorflow/contrib/cmake/tf_core_cpu.cmake
+++ b/tensorflow/contrib/cmake/tf_core_cpu.cmake
@@ -4,7 +4,6 @@ file(GLOB_RECURSE tf_core_cpu_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/*.h"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/*.cc"
-    "${tensorflow_source_dir}/tensorflow/core/client/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/graph/*.h"
     "${tensorflow_source_dir}/tensorflow/core/graph/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/public/*.h"
@@ -18,9 +17,17 @@ file(GLOB_RECURSE tf_core_cpu_exclude_srcs
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu_device_factory.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.cc"
     "${tensorflow_source_dir}/tensorflow/core/common_runtime/direct_session.h"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
 )
 list(REMOVE_ITEM tf_core_cpu_srcs ${tf_core_cpu_exclude_srcs})
+
+# We need to include stubs for the GPU tracer, which are in the exclude glob.
+list(APPEND tf_core_cpu_srcs
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/gpu/gpu_tracer.h"
+)

 add_library(tf_core_cpu OBJECT ${tf_core_cpu_srcs})
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index 78aa9169ddeeab96cf464f68df0662a587378e2c..3e6ec3c389e4875378bd354906b6e313d56e85a6 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -121,7 +121,10 @@ target_include_directories(tf_protos_cc PUBLIC
 target_link_libraries(tf_protos_cc PUBLIC
     ${PROTOBUF_LIBRARIES}
 )
-
+# C++11
+target_compile_features(tf_protos_cc PRIVATE
+    cxx_rvalue_references
+)

 ########################################################
 # tf_core_lib library
@@ -154,11 +157,6 @@ target_include_directories(tf_core_lib PUBLIC
     ${jsoncpp_INCLUDE_DIR}
     ${boringssl_INCLUDE_DIR}
 )
-#target_link_libraries(tf_core_lib
-#    ${CMAKE_THREAD_LIBS_INIT}
-#    ${PROTOBUF_LIBRARIES}
-#    tf_protos_cc
-#)
 target_compile_options(tf_core_lib PRIVATE
     -fno-exceptions
     -DEIGEN_AVOID_STL_ARRAY
@@ -188,6 +186,10 @@ file(GLOB_RECURSE tf_core_framework_srcs
     "${tensorflow_source_dir}/tensorflow/core/framework/*.cc"
     "${tensorflow_source_dir}/tensorflow/core/util/*.h"
     "${tensorflow_source_dir}/tensorflow/core/util/*.cc"
+    "${tensorflow_source_dir}/tensorflow/core/client/tensor_c_api.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_factory.cc"
+    "${tensorflow_source_dir}/tensorflow/core/common_runtime/session_options.cc"
     "${tensorflow_source_dir}/public/*.h"
 )
@@ -204,7 +206,10 @@ file(GLOB_RECURSE tf_core_framework_test_srcs

 list(REMOVE_ITEM tf_core_framework_srcs ${tf_core_framework_test_srcs})

-add_library(tf_core_framework OBJECT ${tf_core_framework_srcs} ${PROTO_TEXT_HDRS})
+add_library(tf_core_framework OBJECT
+    ${tf_core_framework_srcs}
+    ${PROTO_TEXT_HDRS}
+    ${PROTO_TEXT_SRCS})
 target_include_directories(tf_core_framework PUBLIC
     ${tensorflow_source_dir}
     ${eigen_INCLUDE_DIRS}
diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake
index 5a22d881031f67114a6e0b48f6ed622f3f0e49cc..2fff5c2dd37a52dded6e7505135939c927f5d32b 100644
--- a/tensorflow/contrib/cmake/tf_core_kernels.cmake
+++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake
@@ -19,7 +19,13 @@ list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_exclude_srcs})

 add_library(tf_core_kernels OBJECT ${tf_core_kernels_srcs})

-add_dependencies(tf_core_kernels tf_core_cpu farmhash highwayhash)
+add_dependencies(tf_core_kernels
+    tf_core_cpu
+    farmhash
+    highwayhash
+    farmhash_copy_headers_to_destination
+    highwayhash_copy_headers_to_destination
+)

 target_include_directories(tf_core_kernels PRIVATE
     ${tensorflow_source_dir}
diff --git a/tensorflow/contrib/cmake/tf_stream_executor.cmake b/tensorflow/contrib/cmake/tf_stream_executor.cmake
index 0bc8dad0ab7d61c6920cf94a805ae9aa687b5e04..e1aa0cd7b5da76cd36d0a044d0adaba94c0a4a2d 100644
--- a/tensorflow/contrib/cmake/tf_stream_executor.cmake
+++ b/tensorflow/contrib/cmake/tf_stream_executor.cmake
@@ -58,6 +58,7 @@ add_library(tf_stream_executor OBJECT ${tf_stream_executor_srcs})

 target_include_directories(tf_stream_executor PRIVATE
     ${tensorflow_source_dir}
+    ${eigen_INCLUDE_DIRS}
 )
 add_dependencies(tf_stream_executor
     tf_core_lib
diff --git a/tensorflow/contrib/cmake/tf_tutorials.cmake b/tensorflow/contrib/cmake/tf_tutorials.cmake
index 89511b096d49aa2b6e5f7df9e73b3e7253308641..11dfd4739b4f7b6368f2fcbb0bfb472ad5f6f3b5 100644
--- a/tensorflow/contrib/cmake/tf_tutorials.cmake
+++ b/tensorflow/contrib/cmake/tf_tutorials.cmake
@@ -35,10 +35,13 @@ target_include_directories(tf_tutorials_example_trainer PUBLIC

 target_link_libraries(tf_tutorials_example_trainer PUBLIC
     ${CMAKE_THREAD_LIBS_INIT}
-    ${PROTOBUF_LIBRARIES}
+    ${PROTOBUF_STATIC_LIBRARIES}
     tf_protos_cc
     re2_lib
+    ${boringssl_STATIC_LIBRARIES}
+    ${farmhash_STATIC_LIBRARIES}
     ${jpeg_STATIC_LIBRARIES}
+    ${jsoncpp_STATIC_LIBRARIES}
     ${png_STATIC_LIBRARIES}
     ${ZLIB_LIBRARIES}
     ${CMAKE_DL_LIBS}
diff --git a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
index c529a2e1710baf6b89f99268db52664146e4e2be..dc79e7a12a008b7973078d852e152b59b82369b3 100644
--- a/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
+++ b/tensorflow/contrib/ios_examples/camera/CameraExampleViewController.mm
@@ -291,7 +291,7 @@ didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
             in + (in_y * image_width * image_channels) + (in_x * image_channels);
         float *out_pixel = out_row + (x * wanted_channels);
         for (int c = 0; c < wanted_channels; ++c) {
-          out_pixel[c] = (in_pixel[c] / input_std) - input_mean;
+          out_pixel[c] = (in_pixel[c] - input_mean) / input_std;
         }
       }
     }
diff --git a/tensorflow/contrib/ios_examples/simple/RunModelViewController.mm b/tensorflow/contrib/ios_examples/simple/RunModelViewController.mm
index 19f00ad479466488423b9dd2ca17e66f90fef135..2e389b39d4b6a38dff03a24f94ad75e5fb2f7fef 100644
--- a/tensorflow/contrib/ios_examples/simple/RunModelViewController.mm
+++ b/tensorflow/contrib/ios_examples/simple/RunModelViewController.mm
@@ -202,7 +202,7 @@ NSString* RunInferenceOnImage() {
       tensorflow::uint8* in_pixel = in_row + (in_x * image_channels);
       float* out_pixel = out_row + (x * wanted_channels);
       for (int c = 0; c < wanted_channels; ++c) {
-        out_pixel[c] = (in_pixel[c] / input_std) - input_mean;
+        out_pixel[c] = (in_pixel[c] - input_mean) / input_std;
       }
     }
   }
diff --git a/tensorflow/contrib/learn/python/learn/ops/dnn_ops.py b/tensorflow/contrib/learn/python/learn/ops/dnn_ops.py
index c1fba21619e4d4ea871caf19e647bd76712f3347..e2d55ef8092a366a0da625fc17e40c5c0ed00104 100644
--- a/tensorflow/contrib/learn/python/learn/ops/dnn_ops.py
+++ b/tensorflow/contrib/learn/python/learn/ops/dnn_ops.py
@@ -22,7 +22,9 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework.python.framework.deprecation import deprecated
 from tensorflow.contrib.learn.python.learn.ops import dropout_ops
+
 from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.ops import array_ops as array_ops_
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -32,6 +34,7 @@ from tensorflow.python.ops import variable_scope as vs
 @deprecated('2016-08-01', 'Please use tf.contrib.layers.stack instead.')
 def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
   """Creates fully connected deep neural network subgraph.
+  This is deprecated. Please use contrib.layers.dnn instead.

   Args:
     tensor_in: tensor or placeholder for input features.
@@ -42,6 +45,8 @@ def dnn(tensor_in, hidden_units, activation=nn.relu, dropout=None):
   Returns:
     A tensor which would be a deep neural network.
""" + logging.warning("learn.ops.dnn is deprecated, \ + please use contrib.layers.dnn.") with vs.variable_scope('dnn'): for i, n_units in enumerate(hidden_units): with vs.variable_scope('layer%d' % i): diff --git a/tensorflow/contrib/learn/python/learn/ops/dropout_ops.py b/tensorflow/contrib/learn/python/learn/ops/dropout_ops.py index d49a7d0d58880f4401427464968c168603710161..a0153f1ac64034be2ad4cd6b5b3f30289467e142 100644 --- a/tensorflow/contrib/learn/python/learn/ops/dropout_ops.py +++ b/tensorflow/contrib/learn/python/learn/ops/dropout_ops.py @@ -1,3 +1,4 @@ + # Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,8 +22,10 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.ops import init_ops -from tensorflow.python.ops import nn from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.platform import tf_logging as logging + +from tensorflow.contrib.layers import dropout as contrib_dropout # Key to collect dropout probabilities. DROPOUTS = "dropouts" @@ -30,7 +33,8 @@ DROPOUTS = "dropouts" def dropout(tensor_in, prob, name=None): """Adds dropout node and stores probability tensor into graph collection. - + This is deprecated. Please use contrib.layers.dropout instead. + Args: tensor_in: Input tensor. prob: Float or Tensor. @@ -42,10 +46,12 @@ def dropout(tensor_in, prob, name=None): Raises: ValueError: If `keep_prob` is not in `(0, 1]`. """ + logging.warning("learn.ops.dropout is deprecated, \ + please use contrib.layers.dropout.") with ops.op_scope([tensor_in], name, "dropout") as name: if isinstance(prob, float): prob = vs.get_variable("prob", [], initializer=init_ops.constant_initializer(prob), trainable=False) ops.add_to_collection(DROPOUTS, prob) - return nn.dropout(tensor_in, prob) + return contrib_dropout(tensor_in, keep_prob=prob) diff --git a/tensorflow/contrib/learn/python/learn/ops/tests/dropout_ops_test.py b/tensorflow/contrib/learn/python/learn/ops/tests/dropout_ops_test.py index 4ce38b49ebaf2c246acac2813c5ecd7e19a0a8ef..e6d3911c01f6d9bc4f11c110bee5ce373becc209 100644 --- a/tensorflow/contrib/learn/python/learn/ops/tests/dropout_ops_test.py +++ b/tensorflow/contrib/learn/python/learn/ops/tests/dropout_ops_test.py @@ -29,6 +29,7 @@ class DropoutTest(tf.test.TestCase): def test_dropout_float(self): with self.test_session() as session: + tf.add_to_collection("IS_TRAINING", True) x = tf.placeholder(tf.float32, [5, 5]) ops.dropout(x, 0.5) probs = tf.get_collection(ops.DROPOUTS) @@ -38,6 +39,7 @@ class DropoutTest(tf.test.TestCase): def test_dropout_tensor(self): with self.test_session(): + tf.add_to_collection("IS_TRAINING", True) x = tf.placeholder(tf.float32, [5, 5]) y = tf.get_variable("prob", [], initializer=tf.constant_initializer(0.5)) ops.dropout(x, y) diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/feeding_functions_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/feeding_functions_test.py index 47d835e6ebfc4b6f6a67c8a8b0330d1cf55c7da4..a79bd29b8b14d3c9a68051dc028bfaafaa532566 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/feeding_functions_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/feeding_functions_test.py @@ -20,10 +20,16 @@ from __future__ import division from __future__ import print_function import numpy as np -import pandas as pd import tensorflow as tf import 
 import tensorflow.contrib.learn.python.learn.dataframe.queues.feeding_functions as ff

+# pylint: disable=g-import-not-at-top
+try:
+  import pandas as pd
+  HAS_PANDAS = True
+except ImportError:
+  HAS_PANDAS = False
+

 def vals_to_list(a):
   return {key: val.tolist() if isinstance(val, np.ndarray) else val
@@ -72,6 +78,8 @@ class _FeedingFunctionsTestCase(tf.test.TestCase):
     self.assertEqual(expected, vals_to_list(actual))

   def testPandasFeedFnBatchOne(self):
+    if not HAS_PANDAS:
+      return
     array1 = np.arange(32, 64)
     array2 = np.arange(64, 96)
     df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
@@ -88,6 +96,8 @@ class _FeedingFunctionsTestCase(tf.test.TestCase):
     self.assertEqual(expected, vals_to_list(actual))

   def testPandasFeedFnBatchFive(self):
+    if not HAS_PANDAS:
+      return
     array1 = np.arange(32, 64)
     array2 = np.arange(64, 96)
     df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
@@ -105,6 +115,8 @@ class _FeedingFunctionsTestCase(tf.test.TestCase):
     self.assertEqual(expected, vals_to_list(actual))

   def testPandasFeedFnBatchOneHundred(self):
+    if not HAS_PANDAS:
+      return
     array1 = np.arange(32, 64)
     array2 = np.arange(64, 96)
     df = pd.DataFrame({"a": array1, "b": array2}, index=np.arange(96, 128))
diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
index 8da96e5444b51392b9c84d770acb7859756bd9f2..c200140d80097ce62e25438206dcf442638eefe8 100644
--- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
+++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py
@@ -123,7 +123,6 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase):
     """Test construction from Pandas DataFrame."""
     if not HAS_PANDAS:
       return
-
     pandas_df = pd.DataFrame({"sparrow": range(10), "ostrich": 1})
     tensorflow_df = df.TensorFlowDataFrame.from_pandas(pandas_df,
                                                        batch_size=10,
@@ -176,7 +175,6 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase):
   def testFromCSV(self):
     if not HAS_PANDAS:
       return
-
     num_batches = 100
     batch_size = 8
     enqueue_size = 7
@@ -214,6 +212,8 @@ class TensorFlowDataFrameTestCase(tf.test.TestCase):
     self.assertEqual(expected_num_batches, actual_num_batches)

   def testFromCSVWithFeatureSpec(self):
+    if not HAS_PANDAS:
+      return
     num_batches = 100
     batch_size = 8
diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 5f77ee40e239238f2a5d3a9106f8393a436c7d77..c9b4641afa2b5c98b95bc98f3d4cc3b69ae95e8d 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -66,7 +66,8 @@ HOST_LIBS := \
 -lstdc++ \
 -lprotobuf \
 -lpthread \
--lm
+-lm \
+-lz

 # If we're on Linux, also link in the dl library.
 ifeq ($(HOST_OS),LINUX)
@@ -115,7 +116,7 @@ PROTOGENDIR := $(GENDIR)proto/

 # Settings for the target compiler.
 CXX := $(CC_PREFIX) gcc
 OPTFLAGS := -O0
-CXXFLAGS := --std=c++11 $(OPTFLAGS)
+CXXFLAGS := --std=c++11 -DIS_SLIM_BUILD $(OPTFLAGS)
 LDFLAGS := \
 -L/usr/local/lib
@@ -367,6 +368,7 @@ TF_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_cc_files.txt)
 PBT_CC_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_pb_text_files.txt)
 PROTO_SRCS := $(shell cat $(MAKEFILE_DIR)/tf_proto_files.txt)
 BENCHMARK_SRCS := \
+tensorflow/core/util/reporter.cc \
 tensorflow/tools/benchmark/benchmark_model.cc \
 tensorflow/tools/benchmark/benchmark_model_main.cc
diff --git a/tensorflow/contrib/makefile/gen_file_lists.sh b/tensorflow/contrib/makefile/gen_file_lists.sh
index 71a0d8d618430dc72e54779e5728ee2da02b6ede..2bbc6bfcaeebffa0905c9956a160453e8bb1cb7b 100755
--- a/tensorflow/contrib/makefile/gen_file_lists.sh
+++ b/tensorflow/contrib/makefile/gen_file_lists.sh
@@ -21,21 +21,22 @@ grep "//tensorflow/.*\.cc$" | \
 grep -v "gen_proto_text" | \
 grep -E -v "jpeg" | \
 grep -E -v "png" | \
+grep -E -v "zlib" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/tf_cc_files.txt
+> tensorflow/contrib/makefile/tf_cc_files.txt

 bazel query 'kind("source file", deps(//tensorflow/core:android_tensorflow_lib))' | \
 grep "//tensorflow/.*\.proto$" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/tf_proto_files.txt
+> tensorflow/contrib/makefile/tf_proto_files.txt

 bazel query 'kind("generated file", deps(//tensorflow/core:proto_text))' | \
 grep "pb_text\.cc$" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/tf_pb_text_files.txt
+> tensorflow/contrib/makefile/tf_pb_text_files.txt

 bazel query 'kind("source file", deps(//tensorflow/tools/proto_text:gen_proto_text_functions))' | \
 grep -E "//tensorflow/.*\.cc$" | \
@@ -43,16 +44,16 @@ grep -E -v "jpeg" | \
 grep -E -v "png" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/proto_text_cc_files.txt
+> tensorflow/contrib/makefile/proto_text_cc_files.txt

 bazel query 'kind("generated file", deps(//tensorflow/tools/proto_text:gen_proto_text_functions))' | \
 grep -E "//tensorflow/.*\.cc$" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/proto_text_pb_cc_files.txt
+> tensorflow/contrib/makefile/proto_text_pb_cc_files.txt

 bazel query 'kind("generated file", deps(//tensorflow/tools/proto_text:gen_proto_text_functions))' | \
 grep -E "//tensorflow/.*\.h$" | \
 sed -E 's#^//##g' | \
 sed -E 's#:#/#g' \
-> make/proto_text_pb_h_files.txt
+> tensorflow/contrib/makefile/proto_text_pb_h_files.txt
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 5dc57a0484793f0ebf91ff81aa4dc6380c53c7dc..1809a7a69bae638f3600989002ab220478e281e6 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -24,6 +24,8 @@ tensorflow/core/lib/random/weighted_picker.cc
 tensorflow/core/lib/random/simple_philox.cc
 tensorflow/core/lib/random/random.cc
 tensorflow/core/lib/random/distribution_sampler.cc
+tensorflow/core/lib/io/zlib_outputbuffer.cc
+tensorflow/core/lib/io/zlib_inputbuffer.cc
 tensorflow/core/lib/io/two_level_iterator.cc
 tensorflow/core/lib/io/table_builder.cc
 tensorflow/core/lib/io/table.cc
diff --git a/tensorflow/contrib/makefile/tf_cc_files.txt b/tensorflow/contrib/makefile/tf_cc_files.txt
index 220d409d30e1e08cfe5606bcc387f19f93a8575b..5402642f5b0c42aa427adccab2738afdabca985b 100644
--- a/tensorflow/contrib/makefile/tf_cc_files.txt
+++ b/tensorflow/contrib/makefile/tf_cc_files.txt
@@ -7,6 +7,7 @@ tensorflow/core/kernels/transpose_functor_cpu.cc
 tensorflow/core/kernels/training_ops.cc
 tensorflow/core/kernels/topk_op.cc
 tensorflow/core/kernels/tile_ops.cc
+tensorflow/core/kernels/strided_slice_op.cc
 tensorflow/core/kernels/stack_ops.cc
 tensorflow/core/kernels/split_op.cc
 tensorflow/core/kernels/split_lib_cpu.cc
@@ -25,6 +26,7 @@ tensorflow/core/kernels/reverse_sequence_op.cc
 tensorflow/core/kernels/reverse_op.cc
 tensorflow/core/kernels/restore_op.cc
 tensorflow/core/kernels/resize_nearest_neighbor_op.cc
+tensorflow/core/kernels/resize_bilinear_op.cc
 tensorflow/core/kernels/reshape_op.cc
 tensorflow/core/kernels/relu_op.cc
 tensorflow/core/kernels/reduction_ops_sum.cc
@@ -52,6 +54,7 @@ tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/cwise_ops_common.cc
 tensorflow/core/kernels/cwise_op_tanh.cc
 tensorflow/core/kernels/cwise_op_sub.cc
+tensorflow/core/kernels/cwise_op_squared_difference.cc
 tensorflow/core/kernels/cwise_op_square.cc
 tensorflow/core/kernels/cwise_op_sqrt.cc
 tensorflow/core/kernels/cwise_op_sigmoid.cc
@@ -64,6 +67,7 @@ tensorflow/core/kernels/cwise_op_maximum.cc
 tensorflow/core/kernels/cwise_op_log.cc
 tensorflow/core/kernels/cwise_op_less.cc
 tensorflow/core/kernels/cwise_op_isfinite.cc
+tensorflow/core/kernels/cwise_op_inverse.cc
 tensorflow/core/kernels/cwise_op_greater.cc
 tensorflow/core/kernels/cwise_op_exp.cc
 tensorflow/core/kernels/cwise_op_equal_to.cc
@@ -71,6 +75,7 @@ tensorflow/core/kernels/cwise_op_div.cc
 tensorflow/core/kernels/cwise_op_add.cc
 tensorflow/core/kernels/ctc_decoder_ops.cc
 tensorflow/core/kernels/conv_ops.cc
+tensorflow/core/kernels/conv_grad_ops.cc
 tensorflow/core/kernels/control_flow_ops.cc
 tensorflow/core/kernels/constant_op.cc
 tensorflow/core/kernels/concat_op.cc
@@ -94,7 +99,6 @@ tensorflow/core/util/tensor_format.cc
 tensorflow/core/util/stat_summarizer.cc
 tensorflow/core/util/sparse/group_iterator.cc
 tensorflow/core/util/saved_tensor_slice_util.cc
-tensorflow/core/util/reporter.cc
 tensorflow/core/util/port.cc
 tensorflow/core/util/padding.cc
 tensorflow/core/util/mirror_pad_mode.cc
@@ -179,6 +183,7 @@ tensorflow/core/lib/core/arena.cc
 tensorflow/core/graph/validate.cc
 tensorflow/core/graph/tensor_id.cc
 tensorflow/core/graph/subgraph.cc
+tensorflow/core/graph/quantize_training.cc
 tensorflow/core/graph/optimizer_cse.cc
 tensorflow/core/graph/node_builder.cc
 tensorflow/core/graph/graph_partition.cc
@@ -200,6 +205,7 @@ tensorflow/core/framework/tensor_slice.cc
 tensorflow/core/framework/tensor_shape.cc
 tensorflow/core/framework/tensor_reference.cc
 tensorflow/core/framework/tensor.cc
+tensorflow/core/framework/shape_inference.cc
 tensorflow/core/framework/resource_mgr.cc
 tensorflow/core/framework/rendezvous.cc
 tensorflow/core/framework/reader_op_kernel.cc
diff --git a/tensorflow/contrib/pi_examples/README.md b/tensorflow/contrib/pi_examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8dde63e4c6cea8c9a40c64875a03063aa681290c
--- /dev/null
+++ b/tensorflow/contrib/pi_examples/README.md
@@ -0,0 +1,73 @@
+# TensorFlow Raspberry Pi Examples
+
+This folder contains examples of how to build applications for the Raspberry Pi using TensorFlow.
+
+## Building the Examples
+
+ - Follow the Raspberry Pi section of the instructions at [tensorflow/contrib/makefile](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/makefile) to compile a static library containing the core TensorFlow code.
+
+ - Install libjpeg, so we can load image files:
+
+```
+sudo apt-get install -y libjpeg-dev
+```
+
+ - To download the example model you'll need, run these commands:
+
+```bash
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_dec_2015_stripped.zip \
+-o /tmp/inception_dec_2015_stripped.zip
+unzip /tmp/inception_dec_2015_stripped.zip \
+-d tensorflow/contrib/pi_examples/label_image/data/
+```
+
+ - From the root of the TensorFlow source tree, run `make -f tensorflow/contrib/pi_examples/label_image/Makefile` to build a basic example.
+
+## Usage
+
+Run `tensorflow/contrib/pi_examples/label_image/gen/bin/label_image` to try out image labeling with the default Grace Hopper image. You should see several lines of output, with "Military Uniform" shown as the top result, something like this:
+
+```bash
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:384] Running model succeeded!
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:284] military uniform (866): 0.624293
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:284] suit (794): 0.0473981
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:284] academic gown (896): 0.0280926
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:284] bolo tie (940): 0.0156956
+I tensorflow/contrib/pi_examples/label_image/label_image.cc:284] bearskin (849): 0.0143348
+```
+
+Once you've verified that it's working, you can supply your own images with `--image=your_image.jpg`, or even use graphs you've trained yourself with the TensorFlow for Poets tutorial by passing `--graph=your_graph.pb --input_layer=Mul --output_layer=final_result`.
+
+## Camera Example
+
+Once you have the simple example running, you can try out a more complex version that
+reads frames from a camera attached to the Pi. You'll need to install and set up your
+camera module first. The example reads frames through Video4Linux, so you'll also need
+that installed. Here are some commands I found necessary to get it set up; there's more
+information at this blog post: http://www.richardmudhar.com/blog/2015/02/raspberry-pi-camera-and-motion-out-of-the-box-sparrowcam/
+
+```
+sudo bash -c "echo 'bcm2835-v4l2' >> /etc/modules"
+sudo apt-get install libv4l-dev
+```
+
+Once that's working, run the following commands to build and run the camera example:
+
+```bash
+make -f tensorflow/contrib/pi_examples/camera/Makefile
+tensorflow/contrib/pi_examples/camera/gen/bin/camera
+```
+
+You should see it looping over camera frames as they come in, and printing the top labels
+to the command line. This is a great starting point for all sorts of fun image recognition
+applications, especially when you combine it with a custom model you've built using
+something like the TensorFlow for Poets tutorial.
+
+The example is designed to work with the Flite speech synthesis tool, so that your Pi
+can speak any labels that have a high enough score. To enable this, just install the
+Flite package and then pipe the output of the binary you've built, like this:
+
+```
+sudo apt-get install flite
+tensorflow/contrib/pi_examples/camera/gen/bin/camera | xargs -n1 flite -t
+```
diff --git a/tensorflow/contrib/pi_examples/camera/Makefile b/tensorflow/contrib/pi_examples/camera/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..2d1460640055c52cca356b022c63cfd9792866c0
--- /dev/null
+++ b/tensorflow/contrib/pi_examples/camera/Makefile
@@ -0,0 +1,84 @@
+# This Makefile compiles the camera example for the Raspberry Pi.
+# See tensorflow/contrib/pi_examples/README.md for full build instructions.
+
+# Find where we're running from, so we can store generated files here.
+SCRIPT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+
+# The location of the tensorflow/contrib/makefile directory.
+TFMAKEFILE_DIR := $(SCRIPT_DIR)/../../makefile
+
+# Where compiled objects are stored.
+GENDIR := $(SCRIPT_DIR)/gen/
+OBJDIR := $(GENDIR)obj/
+LIBDIR := $(GENDIR)lib/
+BINDIR := $(GENDIR)bin/
+
+# The expected locations of the TensorFlow library.
+TFLIBDIR := $(TFMAKEFILE_DIR)/gen/lib
+TFLIBS := $(TFLIBDIR)/libtensorflow-core.a
+
+# Where the downloads have been stored.
+DOWNLOADSDIR := $(TFMAKEFILE_DIR)/downloads
+
+# The location of the compiled protobuf headers generated by TensorFlow.
+PBTGENDIR := $(TFMAKEFILE_DIR)/gen/proto_text/
+PROTOGENDIR := $(TFMAKEFILE_DIR)/gen/proto/
+
+# The name of the output program we're compiling.
+EXECUTABLE_NAME := $(BINDIR)/camera
+
+# Settings for the target compiler.
+CXX := gcc
+OPTFLAGS := -O0
+CXXFLAGS := --std=c++11 $(OPTFLAGS)
+LDFLAGS := \
+-L/usr/local/lib \
+-L$(TFLIBDIR) \
+-Wl,--no-whole-archive
+INCLUDES := \
+-I/usr/local/include \
+-I. \
+-I$(DOWNLOADSDIR) \
+-I$(DOWNLOADSDIR)/eigen-latest/ \
+-I$(PROTOGENDIR) \
+-I$(PBTGENDIR)
+LIBS := \
+-lstdc++ \
+-lprotobuf \
+-lv4l2 \
+-Wl,--allow-multiple-definition \
+-Wl,--whole-archive \
+-ltensorflow-core \
+-Wl,--no-whole-archive \
+-ldl \
+-lpthread \
+-lm \
+-ljpeg
+LIBFLAGS :=
+
+EXECUTABLE_SRCS := tensorflow/contrib/pi_examples/camera/camera.cc
+
+# File names of the intermediate files target compilation generates.
+EXECUTABLE_OBJS := $(addprefix $(OBJDIR), $(EXECUTABLE_SRCS:.cc=.o))
+
+.PHONY: clean
+
+# The target that's compiled if there's no command-line arguments.
+all: $(EXECUTABLE_NAME)
+
+# Rules for target compilation.
+
+$(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(EXECUTABLE_NAME) $(EXECUTABLE_OBJS) \
+	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+
+# Matches on C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(GENDIR)
diff --git a/tensorflow/contrib/pi_examples/camera/camera.cc b/tensorflow/contrib/pi_examples/camera/camera.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9bba110a5290334538ab069774e2eaeccb58d99d
--- /dev/null
+++ b/tensorflow/contrib/pi_examples/camera/camera.cc
@@ -0,0 +1,533 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Full build instructions are at tensorflow/contrib/pi_examples/README.md.
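+//
+// The program's overall flow: open the V4L2 camera device, ask the driver
+// for an RGB24 capture format, memory-map a small set of kernel capture
+// buffers, then loop dequeuing frames, converting each one into a normalized
+// float tensor, running it through the Inception graph, and printing the
+// top-scoring labels.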
+
+#include <errno.h>
+#include <fcntl.h>
+#include <libv4l2.h>
+#include <linux/videodev2.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <fstream>
+#include <vector>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// These are all common classes it's handy to reference with no namespace.
+using tensorflow::Flag;
+using tensorflow::Tensor;
+using tensorflow::Status;
+using tensorflow::string;
+using tensorflow::int32;
+
+// Used to store the memory-mapped buffers we use for capture.
+struct CameraBuffer {
+  void* start;
+  size_t length;
+};
+
+// Wrapper around camera command sending.
+Status SendCameraCommand(int fh, int request, void* arg) {
+  int r;
+  do {
+    r = v4l2_ioctl(fh, request, arg);
+  } while (r == -1 && ((errno == EINTR) || (errno == EAGAIN)));
+  if (r == -1) {
+    LOG(ERROR) << "SendCameraCommand error " << errno << " (" << strerror(errno)
+               << ")";
+    return tensorflow::errors::Unknown("SendCameraCommand error ", errno,
+                                       strerror(errno));
+  }
+  return Status::OK();
+}
+
+Status OpenCamera(int* camera_handle) {
+  const char* dev_name = "/dev/video0";
+  int fd = v4l2_open(dev_name, O_RDWR | O_NONBLOCK, 0);
+  if (fd < 0) {
+    LOG(ERROR) << "Cannot open camera device";
+    return tensorflow::errors::NotFound("V4L2 camera device not found");
+  }
+  *camera_handle = fd;
+  return Status::OK();
+}
+
+Status CloseCamera(int camera_handle) {
+  v4l2_close(camera_handle);
+  return Status::OK();
+}
+
+Status SetCameraFormat(int camera_handle, int wanted_width, int wanted_height) {
+  struct v4l2_format fmt;
+  memset(&fmt, 0, sizeof(fmt));
+  fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  fmt.fmt.pix.width = wanted_width;
+  fmt.fmt.pix.height = wanted_height;
+  fmt.fmt.pix.pixelformat = V4L2_PIX_FMT_RGB24;
+  fmt.fmt.pix.field = V4L2_FIELD_INTERLACED;
+  Status set_format_status =
+      SendCameraCommand(camera_handle, VIDIOC_S_FMT, &fmt);
+  if (!set_format_status.ok()) {
+    LOG(ERROR) << "Setting format failed with " << set_format_status;
+    return set_format_status;
+  }
+  if (fmt.fmt.pix.pixelformat != V4L2_PIX_FMT_RGB24) {
+    LOG(ERROR) << "Libv4l didn't accept RGB24 format. Can't proceed.";
+    return tensorflow::errors::Unknown("Libv4l didn't accept RGB24 format");
+  }
+  if ((fmt.fmt.pix.width != wanted_width) ||
+      (fmt.fmt.pix.height != wanted_height)) {
+    LOG(WARNING) << "Warning: driver is sending image at " << fmt.fmt.pix.width
+                 << "x" << fmt.fmt.pix.height;
+  }
+  return Status::OK();
+}
+
+Status StartCameraCapture(int camera_handle, int buffer_count,
+                          CameraBuffer** buffers) {
+  struct v4l2_requestbuffers req;
+  memset(&req, 0, sizeof(req));
+  req.count = buffer_count;
+  req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  req.memory = V4L2_MEMORY_MMAP;
+  Status request_buffers_status =
+      SendCameraCommand(camera_handle, VIDIOC_REQBUFS, &req);
+  if (!request_buffers_status.ok()) {
+    LOG(ERROR) << "Request buffers failed with " << request_buffers_status;
+    return request_buffers_status;
+  }
+
+  *buffers = (CameraBuffer*)(calloc(buffer_count, sizeof(CameraBuffer)));
+  for (int n_buffers = 0; n_buffers < buffer_count; ++n_buffers) {
+    struct v4l2_buffer buf;
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_MMAP;
+    buf.index = n_buffers;
+    Status query_buffer_status =
+        SendCameraCommand(camera_handle, VIDIOC_QUERYBUF, &buf);
+    if (!query_buffer_status.ok()) {
+      LOG(ERROR) << "Query buffer failed with " << query_buffer_status;
+      return query_buffer_status;
+    }
+    (*buffers)[n_buffers].length = buf.length;
+    (*buffers)[n_buffers].start =
+        v4l2_mmap(NULL, buf.length, PROT_READ | PROT_WRITE, MAP_SHARED,
+                  camera_handle, buf.m.offset);
+
+    if (MAP_FAILED == (*buffers)[n_buffers].start) {
+      LOG(ERROR) << "Memory-mapping buffer failed";
+      return tensorflow::errors::Unknown("Memory-mapping buffer failed");
+    }
+  }
+
+  for (int i = 0; i < buffer_count; ++i) {
+    struct v4l2_buffer buf;
+    memset(&buf, 0, sizeof(buf));
+    buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+    buf.memory = V4L2_MEMORY_MMAP;
+    buf.index = i;
+    Status set_buffer_status =
+        SendCameraCommand(camera_handle, VIDIOC_QBUF, &buf);
+    if (!set_buffer_status.ok()) {
+      LOG(ERROR) << "Set buffer failed with " << set_buffer_status;
+      return set_buffer_status;
+    }
+  }
+
+  enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  Status stream_on_status =
+      SendCameraCommand(camera_handle, VIDIOC_STREAMON, &type);
+  if (!stream_on_status.ok()) {
+    LOG(ERROR) << "Turning stream on failed with " << stream_on_status;
+    return stream_on_status;
+  }
+  return Status::OK();
+}
+
+Status EndCameraCapture(int camera_handle, CameraBuffer* buffers,
+                        int buffer_count) {
+  enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  Status stream_off_status =
+      SendCameraCommand(camera_handle, VIDIOC_STREAMOFF, &type);
+  if (!stream_off_status.ok()) {
+    LOG(ERROR) << "Turning stream off failed with " << stream_off_status;
+    return stream_off_status;
+  }
+  for (int i = 0; i < buffer_count; ++i)
+    v4l2_munmap(buffers[i].start, buffers[i].length);
+  return Status::OK();
+}
+
+Status CaptureNextFrame(int camera_handle, CameraBuffer* buffers,
+                        uint8_t** frame_data, int* frame_data_size,
+                        v4l2_buffer* buf) {
+  int r;
+  do {
+    fd_set fds;
+    FD_ZERO(&fds);
+    FD_SET(camera_handle, &fds);
+    struct timeval tv;
+    tv.tv_sec = 2;
+    tv.tv_usec = 0;
+    r = select(camera_handle + 1, &fds, NULL, NULL, &tv);
+  } while ((r == -1 && (errno == EINTR)));
+  if (r == -1) {
+    LOG(ERROR) << "select() failed while waiting for the camera with " << errno;
+    return tensorflow::errors::Unknown(
+        "CaptureCameraFrame: select() failed with", errno);
+  }
+
+  memset(buf, 0, sizeof(*buf));
+  buf->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+  buf->memory = V4L2_MEMORY_MMAP;
+  Status get_buffer_status =
+      SendCameraCommand(camera_handle, VIDIOC_DQBUF, buf);
+  if (!get_buffer_status.ok()) {
+    LOG(ERROR) << "Get buffer failed with " << get_buffer_status;
+    return get_buffer_status;
+  }
+
+  *frame_data = static_cast<uint8_t*>(buffers[buf->index].start);
+  *frame_data_size = buf->bytesused;
+
+  return Status::OK();
+}
+
+Status ReleaseFrame(int camera_handle, v4l2_buffer* buf) {
+  Status release_buffer_status =
+      SendCameraCommand(camera_handle, VIDIOC_QBUF, buf);
+  if (!release_buffer_status.ok()) {
+    LOG(ERROR) << "Release buffer failed with " << release_buffer_status;
+    return release_buffer_status;
+  }
+  return Status::OK();
+}
+
+// Reads a model graph definition from disk, and creates a session object you
+// can use to run it.
+Status LoadGraph(string graph_file_name,
+                 std::unique_ptr<tensorflow::Session>* session) {
+  tensorflow::GraphDef graph_def;
+  Status load_graph_status =
+      ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
+  if (!load_graph_status.ok()) {
+    return tensorflow::errors::NotFound("Failed to load compute graph at '",
+                                        graph_file_name, "'");
+  }
+  session->reset(tensorflow::NewSession(tensorflow::SessionOptions()));
+  Status session_create_status = (*session)->Create(graph_def);
+  if (!session_create_status.ok()) {
+    return session_create_status;
+  }
+  return Status::OK();
+}
+
+// Analyzes the output of the Inception graph to retrieve the highest scores and
+// their positions in the tensor, which correspond to categories.
+Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
+                    Tensor* out_indices, Tensor* out_scores) {
+  const Tensor& unsorted_scores_tensor = outputs[0];
+  auto unsorted_scores_flat = unsorted_scores_tensor.flat<float>();
+  std::vector<std::pair<int, float>> scores;
+  for (int i = 0; i < unsorted_scores_flat.size(); ++i) {
+    scores.push_back(std::pair<int, float>({i, unsorted_scores_flat(i)}));
+  }
+  std::sort(scores.begin(), scores.end(),
+            [](const std::pair<int, float>& left,
+               const std::pair<int, float>& right) {
+              return left.second > right.second;
+            });
+  scores.resize(how_many_labels);
+  Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()});
+  Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()});
+  for (int i = 0; i < scores.size(); ++i) {
+    sorted_indices.flat<int>()(i) = scores[i].first;
+    sorted_scores.flat<float>()(i) = scores[i].second;
+  }
+  *out_indices = sorted_indices;
+  *out_scores = sorted_scores;
+  return Status::OK();
+}
+
+// Takes a file name, and loads a list of labels from it, one per line, and
+// returns a vector of the strings. It pads with empty strings so the length
+// of the result is a multiple of 16, because our model expects that.
+Status ReadLabelsFile(string file_name, std::vector<string>* result,
+                      size_t* found_label_count) {
+  std::ifstream file(file_name);
+  if (!file) {
+    return tensorflow::errors::NotFound("Labels file ", file_name,
+                                        " not found.");
+  }
+  result->clear();
+  string line;
+  while (std::getline(file, line)) {
+    result->push_back(line);
+  }
+  *found_label_count = result->size();
+  const int padding = 16;
+  while (result->size() % padding) {
+    result->emplace_back();
+  }
+  return Status::OK();
+}
+
+// Given the output of a model run and the list of labels, this prints out the
+// top five highest-scoring values.
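+// The print_threshold argument is a fraction between 0 and 1; main() converts
+// the integer --print_threshold flag, which is a percentage, by multiplying
+// it by 0.01 before passing it in.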
+Status PrintTopLabels(const std::vector<Tensor>& outputs,
+                      const std::vector<string>& labels, int label_count,
+                      float print_threshold) {
+  const int how_many_labels = std::min(5, static_cast<int>(label_count));
+  Tensor indices;
+  Tensor scores;
+  TF_RETURN_IF_ERROR(GetTopLabels(outputs, how_many_labels, &indices, &scores));
+  tensorflow::TTypes<float>::Flat scores_flat = scores.flat<float>();
+  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
+  for (int pos = 0; pos < how_many_labels; ++pos) {
+    const int label_index = indices_flat(pos);
+    const float score = scores_flat(pos);
+    LOG(INFO) << labels[label_index] << " (" << label_index << "): " << score;
+    // Print the top label to stdout if it's above a threshold.
+    if ((pos == 0) && (score > print_threshold)) {
+      std::cout << labels[label_index] << std::endl;
+    }
+  }
+  return Status::OK();
+}
+
+// Given an image buffer, resize it to the requested size, and then scale the
+// values as desired.
+Status TensorFromFrame(uint8_t* image_data, int image_width, int image_height,
+                       int image_channels, const int wanted_height,
+                       const int wanted_width, const float input_mean,
+                       const float input_std,
+                       std::vector<Tensor>* out_tensors) {
+  const int wanted_channels = 3;
+  if (image_channels < wanted_channels) {
+    return tensorflow::errors::FailedPrecondition(
+        "Image needs to have at least ", wanted_channels, " but only has ",
+        image_channels);
+  }
+  // In these loops, we convert the eight-bit data in the image into float,
+  // resize it using bilinear filtering, and scale it numerically to the float
+  // range that the model expects (given by input_mean and input_std).
+  tensorflow::Tensor image_tensor(
+      tensorflow::DT_FLOAT,
+      tensorflow::TensorShape(
+          {1, wanted_height, wanted_width, wanted_channels}));
+  auto image_tensor_mapped = image_tensor.tensor<float, 4>();
+  tensorflow::uint8* in = image_data;
+  float* out = image_tensor_mapped.data();
+  const size_t image_rowlen = image_width * image_channels;
+  const float width_scale = static_cast<float>(image_width) / wanted_width;
+  const float height_scale = static_cast<float>(image_height) / wanted_height;
+  for (int y = 0; y < wanted_height; ++y) {
+    const float in_y = y * height_scale;
+    const int top_y_index = static_cast<int>(floorf(in_y));
+    const int bottom_y_index =
+        std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
+    const float y_lerp = in_y - top_y_index;
+    tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
+    tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
+    float* out_row = out + (y * wanted_width * wanted_channels);
+    for (int x = 0; x < wanted_width; ++x) {
+      const float in_x = x * width_scale;
+      const int left_x_index = static_cast<int>(floorf(in_x));
+      const int right_x_index =
+          std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
+      tensorflow::uint8* in_top_left_pixel =
+          in_top_row + (left_x_index * wanted_channels);
+      tensorflow::uint8* in_top_right_pixel =
+          in_top_row + (right_x_index * wanted_channels);
+      tensorflow::uint8* in_bottom_left_pixel =
+          in_bottom_row + (left_x_index * wanted_channels);
+      tensorflow::uint8* in_bottom_right_pixel =
+          in_bottom_row + (right_x_index * wanted_channels);
+      const float x_lerp = in_x - left_x_index;
+      float* out_pixel = out_row + (x * wanted_channels);
+      for (int c = 0; c < wanted_channels; ++c) {
+        const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
+        const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
+        const float bottom_left((in_bottom_left_pixel[c] - input_mean) /
+                                input_std);
+        const float bottom_right((in_bottom_right_pixel[c] - input_mean) /
+                                 input_std);
+        const float top = top_left + (top_right - top_left) * x_lerp;
+        const float bottom =
+            bottom_left + (bottom_right - bottom_left) * x_lerp;
+        out_pixel[c] = top + (bottom - top) * y_lerp;
+      }
+    }
+  }
+
+  out_tensors->push_back(image_tensor);
+  return Status::OK();
+}
+
+int main(int argc, char** argv) {
+  string graph =
+      "tensorflow/contrib/pi_examples/label_image/data/"
+      "tensorflow_inception_stripped.pb";
+  string labels_file_name =
+      "tensorflow/contrib/pi_examples/label_image/data/"
+      "imagenet_comp_graph_label_strings.txt";
+  int32 input_width = 299;
+  int32 input_height = 299;
+  int32 input_mean = 128;
+  int32 input_std = 128;
+  string input_layer = "Mul";
+  string output_layer = "softmax";
+  int32 video_width = 640;
+  int32 video_height = 480;
+  int print_threshold = 50;
+  string root_dir = "";
+  const bool parse_result = tensorflow::ParseFlags(
+      &argc, argv, {Flag("graph", &graph),                        //
+                    Flag("labels", &labels_file_name),            //
+                    Flag("input_width", &input_width),            //
+                    Flag("input_height", &input_height),          //
+                    Flag("input_mean", &input_mean),              //
+                    Flag("input_std", &input_std),                //
+                    Flag("input_layer", &input_layer),            //
+                    Flag("output_layer", &output_layer),          //
+                    Flag("video_width", &video_width),            //
+                    Flag("video_height", &video_height),          //
+                    Flag("print_threshold", &print_threshold),    //
+                    Flag("root_dir", &root_dir)});
+  if (!parse_result) {
+    LOG(ERROR) << "Error parsing command-line flags.";
+    return -1;
+  }
+
+  // First we load and initialize the model.
+  std::unique_ptr<tensorflow::Session> session;
+  string graph_path = tensorflow::io::JoinPath(root_dir, graph);
+  Status load_graph_status = LoadGraph(graph_path, &session);
+  if (!load_graph_status.ok()) {
+    LOG(ERROR) << load_graph_status;
+    return -1;
+  }
+
+  std::vector<string> labels;
+  size_t label_count;
+  Status read_labels_status =
+      ReadLabelsFile(labels_file_name, &labels, &label_count);
+  if (!read_labels_status.ok()) {
+    LOG(ERROR) << read_labels_status;
+    return -1;
+  }
+
+  int camera_handle;
+  Status open_status = OpenCamera(&camera_handle);
+  if (!open_status.ok()) {
+    LOG(ERROR) << "OpenCamera failed with " << open_status;
+    return -1;
+  }
+
+  Status format_status =
+      SetCameraFormat(camera_handle, video_width, video_height);
+  if (!format_status.ok()) {
+    LOG(ERROR) << "SetCameraFormat failed with " << format_status;
+    return -1;
+  }
+
+  const int how_many_buffers = 2;
+  CameraBuffer* buffers;
+  Status start_capture_status =
+      StartCameraCapture(camera_handle, how_many_buffers, &buffers);
+  if (!start_capture_status.ok()) {
+    LOG(ERROR) << "StartCameraCapture failed with " << start_capture_status;
+    return -1;
+  }
+
+  for (int i = 0; i < 200; i++) {
+    uint8_t* frame_data;
+    int frame_data_size;
+    v4l2_buffer buf;
+    Status capture_next_status = CaptureNextFrame(
+        camera_handle, buffers, &frame_data, &frame_data_size, &buf);
+    if (!capture_next_status.ok()) {
+      LOG(ERROR) << "CaptureNextFrame failed with " << capture_next_status;
+      return -1;
+    }
+
+    std::vector<Tensor> resized_tensors;
+    Status tensor_from_frame_status =
+        TensorFromFrame(frame_data, video_width, video_height, 3, input_height,
+                        input_width, input_mean, input_std, &resized_tensors);
+    if (!tensor_from_frame_status.ok()) {
+      LOG(ERROR) << tensor_from_frame_status;
+      return -1;
+    }
+    const Tensor& resized_tensor = resized_tensors[0];
+
+    Status release_frame_status = ReleaseFrame(camera_handle, &buf);
+    if (!release_frame_status.ok()) {
+      LOG(ERROR) << "ReleaseFrame failed with " << release_frame_status;
+      return -1;
+    }
+
+    // Actually run the image through the model.
+    std::vector<Tensor> outputs;
+    Status run_status = session->Run({{input_layer, resized_tensor}},
+                                     {output_layer}, {}, &outputs);
+    if (!run_status.ok()) {
+      LOG(ERROR) << "Running model failed: " << run_status;
+      return -1;
+    }
+
+    // Do something interesting with the results we've generated.
+    Status print_status =
+        PrintTopLabels(outputs, labels, label_count, print_threshold * 0.01f);
+    if (!print_status.ok()) {
+      LOG(ERROR) << "Running print failed: " << print_status;
+      return -1;
+    }
+  }
+
+  Status end_capture_status =
+      EndCameraCapture(camera_handle, buffers, how_many_buffers);
+  if (!end_capture_status.ok()) {
+    LOG(ERROR) << "EndCameraCapture failed with " << end_capture_status;
+    return -1;
+  }
+
+  Status close_status = CloseCamera(camera_handle);
+  if (!close_status.ok()) {
+    LOG(ERROR) << "CloseCamera failed with " << close_status;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/tensorflow/contrib/pi_examples/label_image/Makefile b/tensorflow/contrib/pi_examples/label_image/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..1f310ec93bb17346d3b995cb0a86dff76019ced6
--- /dev/null
+++ b/tensorflow/contrib/pi_examples/label_image/Makefile
@@ -0,0 +1,83 @@
+# This Makefile compiles the label_image example for the Raspberry Pi.
+# See tensorflow/contrib/pi_examples/README.md for full build instructions.
+
+# Find where we're running from, so we can store generated files here.
+SCRIPT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
+
+# The location of the tensorflow/contrib/makefile directory.
+TFMAKEFILE_DIR := $(SCRIPT_DIR)/../../makefile
+
+# Where compiled objects are stored.
+GENDIR := $(SCRIPT_DIR)/gen/
+OBJDIR := $(GENDIR)obj/
+LIBDIR := $(GENDIR)lib/
+BINDIR := $(GENDIR)bin/
+
+# The expected locations of the TensorFlow library.
+TFLIBDIR := $(TFMAKEFILE_DIR)/gen/lib
+TFLIBS := $(TFLIBDIR)/libtensorflow-core.a
+
+# Where the downloads have been stored.
+DOWNLOADSDIR := $(TFMAKEFILE_DIR)/downloads
+
+# The location of the compiled protobuf headers generated by TensorFlow.
+PBTGENDIR := $(TFMAKEFILE_DIR)/gen/proto_text/
+PROTOGENDIR := $(TFMAKEFILE_DIR)/gen/proto/
+
+# The name of the output program we're compiling.
+EXECUTABLE_NAME := $(BINDIR)/label_image
+
+# Settings for the target compiler.
+CXX := gcc
+OPTFLAGS := -O0
+CXXFLAGS := --std=c++11 $(OPTFLAGS)
+LDFLAGS := \
+-L/usr/local/lib \
+-L$(TFLIBDIR) \
+-Wl,--no-whole-archive
+INCLUDES := \
+-I/usr/local/include \
+-I. \
+-I$(DOWNLOADSDIR) \
+-I$(DOWNLOADSDIR)/eigen-latest/ \
+-I$(PROTOGENDIR) \
+-I$(PBTGENDIR)
+LIBS := \
+-lstdc++ \
+-lprotobuf \
+-Wl,--allow-multiple-definition \
+-Wl,--whole-archive \
+-ltensorflow-core \
+-Wl,--no-whole-archive \
+-ldl \
+-lpthread \
+-lm \
+-ljpeg
+LIBFLAGS :=
+
+EXECUTABLE_SRCS := tensorflow/contrib/pi_examples/label_image/label_image.cc
+
+# File names of the intermediate files target compilation generates.
+EXECUTABLE_OBJS := $(addprefix $(OBJDIR), $(EXECUTABLE_SRCS:.cc=.o))
+
+.PHONY: clean
+
+# The target that's compiled if there's no command-line arguments.
+all: $(EXECUTABLE_NAME)
+
+# Rules for target compilation.
+
+$(EXECUTABLE_NAME): $(EXECUTABLE_OBJS) $(TFLIBS)
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) \
+	-o $(EXECUTABLE_NAME) $(EXECUTABLE_OBJS) \
+	$(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS)
+
+# Matches on C++ source files.
+$(OBJDIR)%.o: %.cc
+	@mkdir -p $(dir $@)
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Gets rid of all generated files.
+clean:
+	rm -rf $(GENDIR)
diff --git a/tensorflow/contrib/pi_examples/label_image/data/grace_hopper.jpg b/tensorflow/contrib/pi_examples/label_image/data/grace_hopper.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..478720d6694a56b8962630e6dde8a230fc937049
Binary files /dev/null and b/tensorflow/contrib/pi_examples/label_image/data/grace_hopper.jpg differ
diff --git a/tensorflow/contrib/pi_examples/label_image/label_image.cc b/tensorflow/contrib/pi_examples/label_image/label_image.cc
new file mode 100644
index 0000000000000000000000000000000000000000..70f32f21995c09871552df4399d852d684e917af
--- /dev/null
+++ b/tensorflow/contrib/pi_examples/label_image/label_image.cc
@@ -0,0 +1,397 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// A minimal but useful C++ example showing how to load an Imagenet-style object
+// recognition TensorFlow model, prepare input images for it, run them through
+// the graph, and interpret the results.
+//
+// It has been stripped down from the tensorflow/examples/label_image sample
+// code to remove features and ops not included in the mobile/embedded core
+// library available on the Raspberry Pi.
+//
+// Full build instructions are at tensorflow/contrib/pi_examples/README.md.
+
+#include <fstream>
+#include <jpeglib.h>
+#include <setjmp.h>
+#include <stdio.h>
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/default_device.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+// These are all common classes it's handy to reference with no namespace.
+using tensorflow::Flag;
+using tensorflow::Tensor;
+using tensorflow::Status;
+using tensorflow::string;
+using tensorflow::int32;
+
+// Takes a file name, and loads a list of labels from it, one per line, and
+// returns a vector of the strings. It pads with empty strings so the length
+// of the result is a multiple of 16, because our model expects that.
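+// For example, a labels file with 1001 entries gets seven empty strings
+// appended, padding it to 1008 (63 * 16) entries.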
+Status ReadLabelsFile(string file_name, std::vector<string>* result,
+                      size_t* found_label_count) {
+  std::ifstream file(file_name);
+  if (!file) {
+    return tensorflow::errors::NotFound("Labels file ", file_name,
+                                        " not found.");
+  }
+  result->clear();
+  string line;
+  while (std::getline(file, line)) {
+    result->push_back(line);
+  }
+  *found_label_count = result->size();
+  const int padding = 16;
+  while (result->size() % padding) {
+    result->emplace_back();
+  }
+  return Status::OK();
+}
+
+// Error handling for JPEG decoding.
+void CatchError(j_common_ptr cinfo) {
+  (*cinfo->err->output_message)(cinfo);
+  jmp_buf* jpeg_jmpbuf = reinterpret_cast<jmp_buf*>(cinfo->client_data);
+  jpeg_destroy(cinfo);
+  longjmp(*jpeg_jmpbuf, 1);
+}
+
+// Decompresses a JPEG file from disk.
+Status LoadJpegFile(string file_name, std::vector<tensorflow::uint8>* data,
+                    int* width, int* height, int* channels) {
+  struct jpeg_decompress_struct cinfo;
+  FILE* infile;
+  JSAMPARRAY buffer;
+  int row_stride;
+
+  if ((infile = fopen(file_name.c_str(), "rb")) == NULL) {
+    LOG(ERROR) << "Can't open " << file_name;
+    return tensorflow::errors::NotFound("JPEG file ", file_name,
+                                        " not found");
+  }
+
+  struct jpeg_error_mgr jerr;
+  jmp_buf jpeg_jmpbuf;  // recovery point in case of error
+  cinfo.err = jpeg_std_error(&jerr);
+  cinfo.client_data = &jpeg_jmpbuf;
+  jerr.error_exit = CatchError;
+  if (setjmp(jpeg_jmpbuf)) {
+    return tensorflow::errors::Unknown("JPEG decoding failed");
+  }
+
+  jpeg_create_decompress(&cinfo);
+  jpeg_stdio_src(&cinfo, infile);
+  jpeg_read_header(&cinfo, TRUE);
+  jpeg_start_decompress(&cinfo);
+  *width = cinfo.output_width;
+  *height = cinfo.output_height;
+  *channels = cinfo.output_components;
+  data->resize((*height) * (*width) * (*channels));
+
+  row_stride = cinfo.output_width * cinfo.output_components;
+  buffer = (*cinfo.mem->alloc_sarray)
+      ((j_common_ptr) &cinfo, JPOOL_IMAGE, row_stride, 1);
+  while (cinfo.output_scanline < cinfo.output_height) {
+    tensorflow::uint8* row_address = &((*data)[cinfo.output_scanline * row_stride]);
+    jpeg_read_scanlines(&cinfo, buffer, 1);
+    memcpy(row_address, buffer[0], row_stride);
+  }
+
+  jpeg_finish_decompress(&cinfo);
+  jpeg_destroy_decompress(&cinfo);
+  fclose(infile);
+  return Status::OK();
+}
+
+// Given an image file name, read in the data, try to decode it as an image,
+// resize it to the requested size, and then scale the values as desired.
+Status ReadTensorFromImageFile(string file_name, const int wanted_height,
+                               const int wanted_width, const float input_mean,
+                               const float input_std,
+                               std::vector<Tensor>* out_tensors) {
+  std::vector<tensorflow::uint8> image_data;
+  int image_width;
+  int image_height;
+  int image_channels;
+  TF_RETURN_IF_ERROR(LoadJpegFile(file_name, &image_data, &image_width,
+                                  &image_height, &image_channels));
+  LOG(INFO) << "Loaded JPEG: " << image_width << "x" << image_height
+            << "x" << image_channels;
+  const int wanted_channels = 3;
+  if (image_channels < wanted_channels) {
+    return tensorflow::errors::FailedPrecondition("Image needs to have at least ",
+                                                  wanted_channels, " but only has ",
+                                                  image_channels);
+  }
+  // In these loops, we convert the eight-bit data in the image into float, resize
+  // it using bilinear filtering, and scale it numerically to the float range that
+  // the model expects (given by input_mean and input_std).
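+  // Each output pixel blends the four surrounding input pixels:
+  // top = lerp(top_left, top_right, x_lerp), bottom = lerp(bottom_left,
+  // bottom_right, x_lerp), and the result is lerp(top, bottom, y_lerp),
+  // where lerp(a, b, t) = a + (b - a) * t.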
+  tensorflow::Tensor image_tensor(
+      tensorflow::DT_FLOAT, tensorflow::TensorShape(
+          {1, wanted_height, wanted_width, wanted_channels}));
+  auto image_tensor_mapped = image_tensor.tensor<float, 4>();
+  tensorflow::uint8* in = image_data.data();
+  float* out = image_tensor_mapped.data();
+  const size_t image_rowlen = image_width * image_channels;
+  const float width_scale = static_cast<float>(image_width) / wanted_width;
+  const float height_scale = static_cast<float>(image_height) / wanted_height;
+  for (int y = 0; y < wanted_height; ++y) {
+    const float in_y = y * height_scale;
+    const int top_y_index = static_cast<int>(floorf(in_y));
+    const int bottom_y_index =
+        std::min(static_cast<int>(ceilf(in_y)), (image_height - 1));
+    const float y_lerp = in_y - top_y_index;
+    tensorflow::uint8* in_top_row = in + (top_y_index * image_rowlen);
+    tensorflow::uint8* in_bottom_row = in + (bottom_y_index * image_rowlen);
+    float* out_row = out + (y * wanted_width * wanted_channels);
+    for (int x = 0; x < wanted_width; ++x) {
+      const float in_x = x * width_scale;
+      const int left_x_index = static_cast<int>(floorf(in_x));
+      const int right_x_index =
+          std::min(static_cast<int>(ceilf(in_x)), (image_width - 1));
+      tensorflow::uint8* in_top_left_pixel =
+          in_top_row + (left_x_index * wanted_channels);
+      tensorflow::uint8* in_top_right_pixel =
+          in_top_row + (right_x_index * wanted_channels);
+      tensorflow::uint8* in_bottom_left_pixel =
+          in_bottom_row + (left_x_index * wanted_channels);
+      tensorflow::uint8* in_bottom_right_pixel =
+          in_bottom_row + (right_x_index * wanted_channels);
+      const float x_lerp = in_x - left_x_index;
+      float* out_pixel = out_row + (x * wanted_channels);
+      for (int c = 0; c < wanted_channels; ++c) {
+        const float top_left((in_top_left_pixel[c] - input_mean) / input_std);
+        const float top_right((in_top_right_pixel[c] - input_mean) / input_std);
+        const float bottom_left((in_bottom_left_pixel[c] - input_mean) / input_std);
+        const float bottom_right((in_bottom_right_pixel[c] - input_mean) / input_std);
+        const float top = top_left + (top_right - top_left) * x_lerp;
+        const float bottom =
+            bottom_left + (bottom_right - bottom_left) * x_lerp;
+        out_pixel[c] = top + (bottom - top) * y_lerp;
+      }
+    }
+  }
+
+  out_tensors->push_back(image_tensor);
+  return Status::OK();
+}
+
+// Reads a model graph definition from disk, and creates a session object you
+// can use to run it.
+Status LoadGraph(string graph_file_name,
+                 std::unique_ptr<tensorflow::Session>* session) {
+  tensorflow::GraphDef graph_def;
+  Status load_graph_status =
+      ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
+  if (!load_graph_status.ok()) {
+    return tensorflow::errors::NotFound("Failed to load compute graph at '",
+                                        graph_file_name, "'");
+  }
+  session->reset(tensorflow::NewSession(tensorflow::SessionOptions()));
+  Status session_create_status = (*session)->Create(graph_def);
+  if (!session_create_status.ok()) {
+    return session_create_status;
+  }
+  return Status::OK();
+}
+
+// Analyzes the output of the Inception graph to retrieve the highest scores and
+// their positions in the tensor, which correspond to categories.
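+// This sorts all of the class scores and then truncates to the requested
+// count, which is simple and fast enough for the roughly one thousand
+// Inception classes; std::partial_sort would avoid the full sort if needed.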
+Status GetTopLabels(const std::vector<Tensor>& outputs, int how_many_labels,
+                    Tensor* out_indices, Tensor* out_scores) {
+  const Tensor& unsorted_scores_tensor = outputs[0];
+  auto unsorted_scores_flat = unsorted_scores_tensor.flat<float>();
+  std::vector<std::pair<int, float>> scores;
+  for (int i = 0; i < unsorted_scores_flat.size(); ++i) {
+    scores.push_back(std::pair<int, float>({i, unsorted_scores_flat(i)}));
+  }
+  std::sort(scores.begin(), scores.end(),
+            [](const std::pair<int, float>& left,
+               const std::pair<int, float>& right) {
+              return left.second > right.second;
+            });
+  scores.resize(how_many_labels);
+  Tensor sorted_indices(tensorflow::DT_INT32, {scores.size()});
+  Tensor sorted_scores(tensorflow::DT_FLOAT, {scores.size()});
+  for (int i = 0; i < scores.size(); ++i) {
+    sorted_indices.flat<int32>()(i) = scores[i].first;
+    sorted_scores.flat<float>()(i) = scores[i].second;
+  }
+  *out_indices = sorted_indices;
+  *out_scores = sorted_scores;
+  return Status::OK();
+}
+
+// Given the output of a model run, and the name of a file containing the
+// labels, this prints out the top five highest-scoring values.
+Status PrintTopLabels(const std::vector<Tensor>& outputs,
+                      string labels_file_name) {
+  std::vector<string> labels;
+  size_t label_count;
+  Status read_labels_status =
+      ReadLabelsFile(labels_file_name, &labels, &label_count);
+  if (!read_labels_status.ok()) {
+    LOG(ERROR) << read_labels_status;
+    return read_labels_status;
+  }
+  const int how_many_labels = std::min(5, static_cast<int>(label_count));
+  Tensor indices;
+  Tensor scores;
+  TF_RETURN_IF_ERROR(GetTopLabels(outputs, how_many_labels, &indices, &scores));
+  tensorflow::TTypes<float>::Flat scores_flat = scores.flat<float>();
+  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
+  for (int pos = 0; pos < how_many_labels; ++pos) {
+    const int label_index = indices_flat(pos);
+    const float score = scores_flat(pos);
+    LOG(INFO) << labels[label_index] << " (" << label_index << "): " << score;
+  }
+  return Status::OK();
+}
+
+// This is a testing function that returns whether the top label index is the
+// one that's expected.
+Status CheckTopLabel(const std::vector<Tensor>& outputs, int expected,
+                     bool* is_expected) {
+  *is_expected = false;
+  Tensor indices;
+  Tensor scores;
+  const int how_many_labels = 1;
+  TF_RETURN_IF_ERROR(GetTopLabels(outputs, how_many_labels, &indices, &scores));
+  tensorflow::TTypes<int32>::Flat indices_flat = indices.flat<int32>();
+  if (indices_flat(0) != expected) {
+    LOG(ERROR) << "Expected label #" << expected << " but got #"
+               << indices_flat(0);
+    *is_expected = false;
+  } else {
+    *is_expected = true;
+  }
+  return Status::OK();
+}
+
+int main(int argc, char* argv[]) {
+  // These are the command-line flags the program can understand.
+  // They define where the graph and input data is located, and what kind of
+  // input the model expects. If you train your own model, or use something
+  // other than GoogLeNet you'll need to update these.
+  string image = "tensorflow/contrib/pi_examples/label_image/data/"
+      "grace_hopper.jpg";
+  string graph =
+      "tensorflow/contrib/pi_examples/label_image/data/"
+      "tensorflow_inception_stripped.pb";
+  string labels =
+      "tensorflow/contrib/pi_examples/label_image/data/"
+      "imagenet_comp_graph_label_strings.txt";
+  int32 input_width = 299;
+  int32 input_height = 299;
+  int32 input_mean = 128;
+  int32 input_std = 128;
+  string input_layer = "Mul";
+  string output_layer = "softmax";
+  bool self_test = false;
+  string root_dir = "";
+  const bool parse_result = tensorflow::ParseFlags(
+      &argc, argv, {Flag("image", &image),                //
+                    Flag("graph", &graph),                //
+                    Flag("labels", &labels),              //
+                    Flag("input_width", &input_width),    //
+                    Flag("input_height", &input_height),  //
+                    Flag("input_mean", &input_mean),      //
+                    Flag("input_std", &input_std),        //
+                    Flag("input_layer", &input_layer),    //
+                    Flag("output_layer", &output_layer),  //
+                    Flag("self_test", &self_test),        //
+                    Flag("root_dir", &root_dir)});
+  if (!parse_result) {
+    LOG(ERROR) << "Error parsing command-line flags.";
+    return -1;
+  }
+
+  // We need to call this to set up global state for TensorFlow.
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+  if (argc > 1) {
+    LOG(ERROR) << "Unknown argument " << argv[1];
+    return -1;
+  }
+
+  // First we load and initialize the model.
+  std::unique_ptr<tensorflow::Session> session;
+  string graph_path = tensorflow::io::JoinPath(root_dir, graph);
+  Status load_graph_status = LoadGraph(graph_path, &session);
+  if (!load_graph_status.ok()) {
+    LOG(ERROR) << load_graph_status;
+    return -1;
+  }
+
+  // Get the image from disk as a float array of numbers, resized and
+  // normalized to the specifications the main graph expects.
+  std::vector<Tensor> resized_tensors;
+  string image_path = tensorflow::io::JoinPath(root_dir, image);
+  Status read_tensor_status =
+      ReadTensorFromImageFile(image_path, input_height, input_width,
+                              input_mean, input_std, &resized_tensors);
+  if (!read_tensor_status.ok()) {
+    LOG(ERROR) << read_tensor_status;
+    return -1;
+  }
+  const Tensor& resized_tensor = resized_tensors[0];
+
+  // Actually run the image through the model.
+  std::vector<Tensor> outputs;
+  Status run_status = session->Run({{input_layer, resized_tensor}},
+                                   {output_layer}, {}, &outputs);
+  if (!run_status.ok()) {
+    LOG(ERROR) << "Running model failed: " << run_status;
+    return -1;
+  } else {
+    LOG(INFO) << "Running model succeeded!";
+  }
+
+  // This is for automated testing to make sure we get the expected result with
+  // the default settings. We know that label 866 (military uniform) should be
+  // the top label for the Admiral Hopper image.
+  if (self_test) {
+    bool expected_matches;
+    Status check_status = CheckTopLabel(outputs, 866, &expected_matches);
+    if (!check_status.ok()) {
+      LOG(ERROR) << "Running check failed: " << check_status;
+      return -1;
+    }
+    if (!expected_matches) {
+      LOG(ERROR) << "Self-test failed!";
+      return -1;
+    }
+  }
+
+  // Do something interesting with the results we've generated.
+  Status print_status = PrintTopLabels(outputs, labels);
+  if (!print_status.ok()) {
+    LOG(ERROR) << "Running print failed: " << print_status;
+    return -1;
+  }
+
+  return 0;
+}
diff --git a/tensorflow/contrib/quantization/tools/quantize_graph.py b/tensorflow/contrib/quantization/tools/quantize_graph.py
index 34bc61d06d84da7d8e4de4539ea843a3c9810541..3ed2ee07f728a1ab92e54c04625033707eb87f31 100644
--- a/tensorflow/contrib/quantization/tools/quantize_graph.py
+++ b/tensorflow/contrib/quantization/tools/quantize_graph.py
@@ -15,7 +15,7 @@ r"""Transforms a float-trained graph into an equivalent quantized version.

 An example of command-line usage is:

-bazel build tensorflow/contrib/quantization/tools/:quantize_graph \
+bazel build tensorflow/contrib/quantization/tools:quantize_graph \
 && bazel-bin/tensorflow/contrib/quantization/tools/quantize_graph \
 --input=tensorflow_inception_graph.pb
 --output_node_names="softmax2" --print_nodes
 --output=/tmp/quantized_graph.pb \
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index f8f540c0c7aacbd4d0e8a8575633bcf7365762dd..9823980e835b3042e3d61d7e581a628935ee509a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -101,7 +101,7 @@ class GrpcMasterService : public AsyncServiceInterface {
     }                                  \
   } while (0)

-  void HandleRPCsLoop() {
+  void HandleRPCsLoop() override {
     ENQUEUE_REQUEST(CreateSession, true);
     ENQUEUE_REQUEST(ExtendSession, false);
     for (int i = 0; i < 100; ++i) {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index f8ad42a6915ad6fa0da7fd96d2448b39ba7cb6cd..0d532520cf8bba8858aa0b7ce0a7a3b9b0d54695 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -73,7 +73,7 @@ class GrpcSession : public Session {
              const std::vector<std::pair<string, Tensor> >& inputs,
              const std::vector<string>& output_tensor_names,
              const std::vector<string>& target_node_names,
-             std::vector<Tensor>* outputs, RunMetadata* run_metadata);
+             std::vector<Tensor>* outputs, RunMetadata* run_metadata) override;

   Status Extend(const GraphDef& graph) override;
   Status Extend(const RunOptions& run_options, const GraphDef& graph) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index bf3f413c66eefc3fbb80d9c9b19ce197e2d0fbf6..bba579a6a8d4d5d234788bfe49159c2e75f483ee 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -108,7 +108,7 @@ class GrpcWorkerService : public AsyncServiceInterface {
   } while (0)

   // This method blocks forever handling requests from the completion queue.
-  void HandleRPCsLoop() {
+  void HandleRPCsLoop() override {
    // TODO(mrry): This may require performance engineering. We can
    // add more threads to service the completion queue, and add more
    // of various request types if they are short and frequent.
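Before moving on from the label_image example above: the bilinear resize inside `ReadTensorFromImageFile` is the densest part of that program, so here is a minimal Python sketch of the same sampling logic for a single channel. It is purely illustrative and not part of the change set; `image` is assumed to be a flat, row-major list of 8-bit values, and the function mirrors the lerp structure of the C++ loop rather than any TensorFlow API.

```python
import math

def bilinear_sample(image, width, height, in_x, in_y):
    """Bilinearly samples one channel of a row-major 8-bit image."""
    left_x = int(math.floor(in_x))
    right_x = min(int(math.ceil(in_x)), width - 1)
    top_y = int(math.floor(in_y))
    bottom_y = min(int(math.ceil(in_y)), height - 1)
    x_lerp = in_x - left_x
    y_lerp = in_y - top_y
    # The four neighboring samples around (in_x, in_y).
    top_left = image[top_y * width + left_x]
    top_right = image[top_y * width + right_x]
    bottom_left = image[bottom_y * width + left_x]
    bottom_right = image[bottom_y * width + right_x]
    # Blend horizontally along the two rows, then vertically between them,
    # exactly as the C++ loop does for each channel.
    top = top_left + (top_right - top_left) * x_lerp
    bottom = bottom_left + (bottom_right - bottom_left) * x_lerp
    return top + (bottom - top) * y_lerp

# A 2x2 checkerboard sampled at its center blends all four corners equally.
print(bilinear_sample([0, 255, 255, 0], 2, 2, 0.5, 0.5))  # 127.5
```

The C++ version applies these same three lerps per channel, after first shifting each sample by `input_mean` and dividing by `input_std` so the result lands in the float range the model expects.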
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1b30313694f197bcf8671325230acaf529aa3988..8925e9a58f7253c035907b8b70c34ef4b3085bc6 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -837,7 +837,6 @@ struct gemm_pack_rhs<
     EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);

     const Index packet_cols4 = (cols / 4) * 4;
-    const bool non_standard_patches = rhs.nonStandardPatches();

     for (Index j2 = 0; j2 < packet_cols4; j2 += 4) {
       const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
diff --git a/tensorflow/core/kernels/reader_base.h b/tensorflow/core/kernels/reader_base.h
index 8c5bb308ec0023c31f8f155b526c093601cc4a6a..3cb910751d0d6a8bb48dd9aa02616ebf97a0a7de 100644
--- a/tensorflow/core/kernels/reader_base.h
+++ b/tensorflow/core/kernels/reader_base.h
@@ -110,7 +110,7 @@ class ReaderBase : public ReaderInterface {
   // In this implementation all the records come from the same work unit.
   int64 ReadUpTo(const int64 num_records, QueueInterface* queue,
                  std::vector<string>* keys, std::vector<string>* value,
-                 OpKernelContext* context);
+                 OpKernelContext* context) override;

   Status Reset() override;
   int64 NumRecordsProduced() override;
diff --git a/tensorflow/core/kernels/sparse_matmul_op.h b/tensorflow/core/kernels/sparse_matmul_op.h
index 613c6a15c5b41a48edaabddd1eedefe94ab28ba4..49c22306a92db0cd17b16be00a14daf31dd0aadb 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.h
+++ b/tensorflow/core/kernels/sparse_matmul_op.h
@@ -40,6 +40,35 @@ EIGEN_DEVICE_FUNC inline Packet pexpand_bf16_u(const Packet& from) {
   return reinterpret_cast<const float&>(tmp);
 }

+// Specialization of the non-scalar version for non-SSE builds.
+#ifndef EIGEN_VECTORIZE_SSE2
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_l(const Packet4f& from) {
+  float r[4];
+  tensorflow::uint32 p[4];
+  pstoreu(r, from);
+  tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r);
+  p[0] = (ir[0] << 16) & 0xffff0000;
+  p[1] = ir[0] & 0xffff0000;
+  p[2] = (ir[1] << 16) & 0xffff0000;
+  p[3] = ir[1] & 0xffff0000;
+  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pexpand_bf16_u(const Packet4f& from) {
+  float r[4];
+  tensorflow::uint32 p[4];
+  pstoreu(r, from);
+  tensorflow::uint32* ir = reinterpret_cast<tensorflow::uint32*>(r);
+  p[0] = (ir[2] << 16) & 0xffff0000;
+  p[1] = ir[2] & 0xffff0000;
+  p[2] = (ir[3] << 16) & 0xffff0000;
+  p[3] = ir[3] & 0xffff0000;
+  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
+}
+#endif
+
 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pinterleave4x64(const Packet& from) {
   return from;
@@ -72,16 +101,41 @@ template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pload4bf16(
     const typename unpacket_traits<Packet>::type* from) {
   assert(false && "Not applicable to Scalar Values");
-  return *from;
+  return Packet();
 }

 template <typename Packet>
 EIGEN_DEVICE_FUNC inline Packet pload2bf16(
     const typename unpacket_traits<Packet>::type* from) {
   assert(false && "Not applicable to Scalar Values");
-  return *from;
+  return Packet();
 }

+// Specializations of pload4bf16 and pload2bf16 for non-SSE builds.
+#ifndef EIGEN_VECTORIZE_SSE2
+template <>
+EIGEN_STRONG_INLINE Packet4f pload4bf16<Packet4f>(const float* from) {
+  tensorflow::uint32 p[4];
+  const tensorflow::uint32* ir =
+      reinterpret_cast<const tensorflow::uint32*>(from);
+  p[0] = (ir[0] << 16) & 0xffff0000;
+  p[1] = ir[0] & 0xffff0000;
+  p[2] = (ir[1] << 16) & 0xffff0000;
+  p[3] = ir[1] & 0xffff0000;
+  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload2bf16<Packet4f>(const float* from) {
+  tensorflow::uint32 p[4];
+  const tensorflow::uint32* ir =
+      reinterpret_cast<const tensorflow::uint32*>(from);
+  p[0] = (ir[0] << 16) & 0xffff0000;
+  p[1] = ir[0] & 0xffff0000;
+  p[2] = (ir[0] << 16) & 0xffff0000;
+  p[3] = ir[0] & 0xffff0000;
+  return ploadu<Packet4f>(reinterpret_cast<float*>(p));
+}
+#endif
+
 #ifdef EIGEN_VECTORIZE_SSE2
 // For PacketSize of 4 floats the Packet is not modified
 template <>
diff --git a/tensorflow/core/lib/core/arena.cc b/tensorflow/core/lib/core/arena.cc
index 5da991084c761521ee1693150d526691980d19bb..403a7cf0ea49720840738a9ea473d84db14533c1 100644
--- a/tensorflow/core/lib/core/arena.cc
+++ b/tensorflow/core/lib/core/arena.cc
@@ -35,8 +35,6 @@ limitations under the License.
 namespace tensorflow {
 namespace core {

-static const int kPageSize = getpagesize();
-
 // ----------------------------------------------------------------------
 // Arena::Arena()
 // Arena::~Arena()
diff --git a/tensorflow/core/lib/io/record_reader.cc b/tensorflow/core/lib/io/record_reader.cc
index eb194a14d4494ba71486d751f5f98f0125774ea1..73b0280a8f0d2b38f2dbd6f82f4ad0a85e74eaf0 100644
--- a/tensorflow/core/lib/io/record_reader.cc
+++ b/tensorflow/core/lib/io/record_reader.cc
@@ -29,9 +29,14 @@ RecordReader::RecordReader(RandomAccessFile* file,
                            const RecordReaderOptions& options)
     : src_(file), options_(options) {
   if (options.compression_type == RecordReaderOptions::ZLIB_COMPRESSION) {
+// We don't have zlib available on all embedded platforms, so fail.
+#if defined(IS_SLIM_BUILD)
+    LOG(FATAL) << "Zlib compression is unsupported on mobile platforms.";
+#else   // IS_SLIM_BUILD
     zlib_input_buffer_.reset(new ZlibInputBuffer(
         src_, options.zlib_options.input_buffer_size,
         options.zlib_options.output_buffer_size, options.zlib_options));
+#endif  // IS_SLIM_BUILD
   } else if (options.compression_type == RecordReaderOptions::NONE) {
     // Nothing to do.
   } else {
@@ -53,6 +58,7 @@ Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
   const size_t expected = n + sizeof(uint32);
   storage->resize(expected);

+#if !defined(IS_SLIM_BUILD)
   if (zlib_input_buffer_) {
     // If we have a zlib compressed buffer, we assume that the
     // file is being read sequentially, and we use the underlying
@@ -77,6 +83,7 @@ Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
     }
     *result = StringPiece(storage->data(), n);
   } else {
+#endif  // IS_SLIM_BUILD
     // This version supports reading from arbitrary offsets
     // since we are accessing the random access file directly.
     StringPiece data;
@@ -93,7 +100,9 @@ Status RecordReader::ReadChecksummed(uint64 offset, size_t n,
     return errors::DataLoss("corrupted record at ", offset);
   }
   *result = StringPiece(data.data(), n);
+#if !defined(IS_SLIM_BUILD)
   }
+#endif  // IS_SLIM_BUILD

   return Status::OK();
 }
diff --git a/tensorflow/core/lib/io/record_reader.h b/tensorflow/core/lib/io/record_reader.h
index b4c56451be8675bdb72061cfeeaec90ad830bfa5..e6e2a8c8abb1e462404ce75f3c5f58f87c2e04ed 100644
--- a/tensorflow/core/lib/io/record_reader.h
+++ b/tensorflow/core/lib/io/record_reader.h
@@ -19,8 +19,10 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/io/inputbuffer.h" +#if !defined(IS_SLIM_BUILD) #include "tensorflow/core/lib/io/zlib_compression_options.h" #include "tensorflow/core/lib/io/zlib_inputbuffer.h" +#endif // IS_SLIM_BUILD #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -35,8 +37,10 @@ class RecordReaderOptions { enum CompressionType { NONE = 0, ZLIB_COMPRESSION = 1 }; CompressionType compression_type = NONE; +#if !defined(IS_SLIM_BUILD) // Options specific to zlib compression. ZlibCompressionOptions zlib_options; +#endif // IS_SLIM_BUILD }; class RecordReader { @@ -59,7 +63,9 @@ class RecordReader { RandomAccessFile* src_; RecordReaderOptions options_; +#if !defined(IS_SLIM_BUILD) std::unique_ptr zlib_input_buffer_; +#endif // IS_SLIM_BUILD TF_DISALLOW_COPY_AND_ASSIGN(RecordReader); }; diff --git a/tensorflow/core/lib/io/record_writer.cc b/tensorflow/core/lib/io/record_writer.cc index 7993f6ca20ae2394373c5ec200907ac63650ccf0..25873b83ba36dfdb0ef83a0f6d69a175398d1c8f 100644 --- a/tensorflow/core/lib/io/record_writer.cc +++ b/tensorflow/core/lib/io/record_writer.cc @@ -26,9 +26,14 @@ RecordWriter::RecordWriter(WritableFile* dest, const RecordWriterOptions& options) : dest_(dest), options_(options) { if (options.compression_type == RecordWriterOptions::ZLIB_COMPRESSION) { +// We don't have zlib available on all embedded platforms, so fail. +#if defined(IS_SLIM_BUILD) + LOG(FATAL) << "Zlib compression is unsupported on mobile platforms."; +#else // IS_SLIM_BUILD zlib_output_buffer_.reset(new ZlibOutputBuffer( dest_, options.zlib_options.input_buffer_size, options.zlib_options.output_buffer_size, options.zlib_options)); +#endif // IS_SLIM_BUILD } else if (options.compression_type == RecordWriterOptions::NONE) { // Nothing to do } else { @@ -37,12 +42,14 @@ RecordWriter::RecordWriter(WritableFile* dest, } RecordWriter::~RecordWriter() { +#if !defined(IS_SLIM_BUILD) if (zlib_output_buffer_) { Status s = zlib_output_buffer_->Close(); if (!s.ok()) { LOG(ERROR) << "Could not finish writing file: " << s; } } +#endif // IS_SLIM_BUILD } static uint32 MaskedCrc(const char* data, size_t n) { @@ -62,16 +69,20 @@ Status RecordWriter::WriteRecord(StringPiece data) { char footer[sizeof(uint32)]; core::EncodeFixed32(footer, MaskedCrc(data.data(), data.size())); +#if !defined(IS_SLIM_BUILD) if (zlib_output_buffer_) { TF_RETURN_IF_ERROR( zlib_output_buffer_->Write(StringPiece(header, sizeof(header)))); TF_RETURN_IF_ERROR(zlib_output_buffer_->Write(data)); return zlib_output_buffer_->Write(StringPiece(footer, sizeof(footer))); } else { +#endif // IS_SLIM_BUILD TF_RETURN_IF_ERROR(dest_->Append(StringPiece(header, sizeof(header)))); TF_RETURN_IF_ERROR(dest_->Append(data)); return dest_->Append(StringPiece(footer, sizeof(footer))); +#if !defined(IS_SLIM_BUILD) } +#endif // IS_SLIM_BUILD } } // namespace io diff --git a/tensorflow/core/lib/io/record_writer.h b/tensorflow/core/lib/io/record_writer.h index 2344df3b25adbf0158b3f2d8f18147e80d764c2c..3d42a281de9ccc27fe7b1ab092242c4c5ae04cb8 100644 --- a/tensorflow/core/lib/io/record_writer.h +++ b/tensorflow/core/lib/io/record_writer.h @@ -18,8 +18,10 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" +#if !defined(IS_SLIM_BUILD) #include "tensorflow/core/lib/io/zlib_compression_options.h" #include "tensorflow/core/lib/io/zlib_outputbuffer.h" +#endif // IS_SLIM_BUILD #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -34,8 +36,10 @@ class RecordWriterOptions { enum CompressionType { NONE = 0, ZLIB_COMPRESSION = 1 }; CompressionType compression_type = NONE; - // Options specific to zlib compression. +// Options specific to zlib compression. +#if !defined(IS_SLIM_BUILD) ZlibCompressionOptions zlib_options; +#endif // IS_SLIM_BUILD }; class RecordWriter { @@ -54,9 +58,11 @@ class RecordWriter { // RecordWriter to the WritableFile. Does *not* flush the // WritableFile. Status Flush() { +#if !defined(IS_SLIM_BUILD) if (zlib_output_buffer_) { return zlib_output_buffer_->Flush(); } +#endif // IS_SLIM_BUILD return Status::OK(); } @@ -64,7 +70,9 @@ class RecordWriter { private: WritableFile* const dest_; RecordWriterOptions options_; +#if !defined(IS_SLIM_BUILD) std::unique_ptr zlib_output_buffer_; +#endif // IS_SLIM_BUILD TF_DISALLOW_COPY_AND_ASSIGN(RecordWriter); }; diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py index 9b0137657765105ce3fb1aae359f91963aa864b7..6a3024d5bc26c465dc9ab4653867d401487089c7 100644 --- a/tensorflow/examples/image_retraining/retrain.py +++ b/tensorflow/examples/image_retraining/retrain.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Simple transfer learning with an Inception v3 architecture model. +"""Simple transfer learning with an Inception v3 architecture model which +displays summaries in TensorBoard. This example shows how to take a Inception v3 architecture model trained on ImageNet images, and train a new top layer that can recognize other classes of @@ -49,6 +50,15 @@ in. This produces a new model file that can be loaded and run by any TensorFlow program, for example the label_image sample code. + +To use with TensorBoard: + +By default, this script will log summaries to /tmp/retrain_logs directory + +Visualize the summaries with this command: + +tensorboard --logdir /tmp/retrain_logs + """ from __future__ import absolute_import from __future__ import division @@ -81,6 +91,8 @@ tf.app.flags.DEFINE_string('output_graph', '/tmp/output_graph.pb', """Where to save the trained graph.""") tf.app.flags.DEFINE_string('output_labels', '/tmp/output_labels.txt', """Where to save the trained graph's labels.""") +tf.app.flags.DEFINE_string('summaries_dir', '/tmp/retrain_logs', + """Where to save summary logs for TensorBoard.""") # Details of the training configuration. 
 tf.app.flags.DEFINE_integer('how_many_training_steps', 4000,
@@ -650,6 +662,19 @@ def add_input_distortions(flip_left_right, random_crop, random_scale,
   return jpeg_data, distort_result


+def variable_summaries(var, name):
+  """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
+  with tf.name_scope('summaries'):
+    mean = tf.reduce_mean(var)
+    tf.scalar_summary('mean/' + name, mean)
+    with tf.name_scope('stddev'):
+      stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
+    tf.scalar_summary('stddev/' + name, stddev)
+    tf.scalar_summary('max/' + name, tf.reduce_max(var))
+    tf.scalar_summary('min/' + name, tf.reduce_min(var))
+    tf.histogram_summary(name, var)
+
+
 def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
   """Adds a new softmax and fully-connected layer for training.

@@ -670,24 +695,43 @@ def add_final_training_ops(class_count, final_tensor_name, bottleneck_tensor):
     The tensors for the training and cross entropy results, and tensors for the
     bottleneck input and ground truth input.
   """
-  bottleneck_input = tf.placeholder_with_default(
-      bottleneck_tensor, shape=[None, BOTTLENECK_TENSOR_SIZE],
-      name='BottleneckInputPlaceholder')
-  layer_weights = tf.Variable(
-      tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, class_count], stddev=0.001),
-      name='final_weights')
-  layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
-  logits = tf.matmul(bottleneck_input, layer_weights,
-                     name='final_matmul') + layer_biases
+  with tf.name_scope('input'):
+    bottleneck_input = tf.placeholder_with_default(
+        bottleneck_tensor, shape=[None, BOTTLENECK_TENSOR_SIZE],
+        name='BottleneckInputPlaceholder')
+
+    ground_truth_input = tf.placeholder(tf.float32,
+                                        [None, class_count],
+                                        name='GroundTruthInput')
+
+  # Organizing the following ops as `final_training_ops` so they're easier
+  # to see in TensorBoard
+  layer_name = 'final_training_ops'
+  with tf.name_scope(layer_name):
+    with tf.name_scope('weights'):
+      layer_weights = tf.Variable(
+          tf.truncated_normal([BOTTLENECK_TENSOR_SIZE, class_count],
+                              stddev=0.001), name='final_weights')
+      variable_summaries(layer_weights, layer_name + '/weights')
+    with tf.name_scope('biases'):
+      layer_biases = tf.Variable(tf.zeros([class_count]), name='final_biases')
+      variable_summaries(layer_biases, layer_name + '/biases')
+    with tf.name_scope('Wx_plus_b'):
+      logits = tf.matmul(bottleneck_input, layer_weights) + layer_biases
+      tf.histogram_summary(layer_name + '/pre_activations', logits)
+
   final_tensor = tf.nn.softmax(logits, name=final_tensor_name)
-  ground_truth_input = tf.placeholder(tf.float32,
-                                      [None, class_count],
-                                      name='GroundTruthInput')
-  cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
+  tf.histogram_summary(final_tensor_name + '/activations', final_tensor)
+
+  with tf.name_scope('cross_entropy'):
+    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
       logits, ground_truth_input)
-  cross_entropy_mean = tf.reduce_mean(cross_entropy)
-  train_step = tf.train.GradientDescentOptimizer(FLAGS.learning_rate).minimize(
-      cross_entropy_mean)
+    with tf.name_scope('total'):
+      cross_entropy_mean = tf.reduce_mean(cross_entropy)
+    tf.scalar_summary('cross entropy', cross_entropy_mean)
+
+  with tf.name_scope('train'):
+    train_step = tf.train.GradientDescentOptimizer(FLAGS.learning_rate).minimize(
+        cross_entropy_mean)
+
   return (train_step, cross_entropy_mean, bottleneck_input, ground_truth_input,
           final_tensor)

@@ -703,13 +747,22 @@ def add_evaluation_step(result_tensor, ground_truth_tensor):

   Returns:
     Nothing.
""" - correct_prediction = tf.equal( - tf.argmax(result_tensor, 1), tf.argmax(ground_truth_tensor, 1)) - evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, 'float')) + with tf.name_scope('accuracy'): + with tf.name_scope('correct_prediction'): + correct_prediction = tf.equal(tf.argmax(result_tensor, 1), \ + tf.argmax(ground_truth_tensor, 1)) + with tf.name_scope('accuracy'): + evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + tf.scalar_summary('accuracy', evaluation_step) return evaluation_step def main(_): + # Setup the directory we'll write summaries to for TensorBoard + if tf.gfile.Exists(FLAGS.summaries_dir): + tf.gfile.DeleteRecursively(FLAGS.summaries_dir) + tf.gfile.MakeDirs(FLAGS.summaries_dir) + # Set up the pre-trained graph. maybe_download_and_extract() graph, bottleneck_tensor, jpeg_data_tensor, resized_image_tensor = ( @@ -750,13 +803,19 @@ def main(_): FLAGS.final_tensor_name, bottleneck_tensor) + # Create the operations we need to evaluate the accuracy of our new layer. + evaluation_step = add_evaluation_step(final_tensor, ground_truth_input) + + # Merge all the summaries and write them out to /tmp/retrain_logs (by default) + merged = tf.merge_all_summaries() + train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', + sess.graph) + validation_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/validation') + # Set up all our weights to their initial default values. init = tf.initialize_all_variables() sess.run(init) - # Create the operations we need to evaluate the accuracy of our new layer. - evaluation_step = add_evaluation_step(final_tensor, ground_truth_input) - # Run the training for as many cycles as requested on the command line. for i in range(FLAGS.how_many_training_steps): # Get a catch of input bottleneck values, either calculated fresh every time @@ -772,10 +831,12 @@ def main(_): FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor, bottleneck_tensor) # Feed the bottlenecks and ground truth into the graph, and run a training - # step. - sess.run(train_step, + # step. Capture training summaries for TensorBoard with the `merged` op. + train_summary, _ = sess.run([merged, train_step], feed_dict={bottleneck_input: train_bottlenecks, ground_truth_input: train_ground_truth}) + train_writer.add_summary(train_summary, i) + # Every so often, print out how well the graph is training. is_last_step = (i + 1 == FLAGS.how_many_training_steps) if (i % FLAGS.eval_step_interval) == 0 or is_last_step: @@ -792,10 +853,13 @@ def main(_): sess, image_lists, FLAGS.validation_batch_size, 'validation', FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor, bottleneck_tensor)) - validation_accuracy = sess.run( - evaluation_step, + # Run a validation step and capture training summaries for TensorBoard + # with the `merged` op. 
+ validation_summary, validation_accuracy = sess.run( + [merged, evaluation_step], feed_dict={bottleneck_input: validation_bottlenecks, ground_truth_input: validation_ground_truth}) + validation_writer.add_summary(validation_summary, i) print('%s: Step %d: Validation accuracy = %.1f%%' % (datetime.now(), i, validation_accuracy * 100)) diff --git a/tensorflow/g3doc/api_docs/python/contrib.metrics.md b/tensorflow/g3doc/api_docs/python/contrib.metrics.md index b7a136f87f7e7a4b86b39f15ce01d670dabca269..aaeed394116639ae61a76323fc0c4525500e7cf2 100644 --- a/tensorflow/g3doc/api_docs/python/contrib.metrics.md +++ b/tensorflow/g3doc/api_docs/python/contrib.metrics.md @@ -106,7 +106,7 @@ idempotent operation that simply divides `total` by `count`. To facilitate the estimation of the accuracy over a stream of data, the function utilizes two operations. First, an `is_correct` operation that computes a tensor whose shape matches `predictions` and whose elements are -set to 1.0 when the corresponding values of `predictions` and `labels match +set to 1.0 when the corresponding values of `predictions` and `labels` match and 0.0 otherwise. Second, an `update_op` operation whose behavior is dependent on the value of `weights`. If `weights` is None, then `update_op` increments `total` with the number of elements of `predictions` that match diff --git a/tensorflow/g3doc/api_docs/python/framework.md b/tensorflow/g3doc/api_docs/python/framework.md index 3a0fc8d28e99042e1034f24deaa2864ecf9cf00f..e3e80da7f18c95c0f05c2310800c5a0e9c56c4c9 100644 --- a/tensorflow/g3doc/api_docs/python/framework.md +++ b/tensorflow/g3doc/api_docs/python/framework.md @@ -1340,18 +1340,14 @@ The following `DType` objects are defined: * `tf.bfloat16`: 16-bit truncated floating-point. * `tf.complex64`: 64-bit single-precision complex. * `tf.complex128`: 128-bit double-precision complex. - * `tf.int8`: 8-bit signed integer. * `tf.uint8`: 8-bit unsigned integer. * `tf.uint16`: 16-bit unsigned integer. * `tf.int16`: 16-bit signed integer. * `tf.int32`: 32-bit signed integer. * `tf.int64`: 64-bit signed integer. - * `tf.bool`: Boolean. - * `tf.string`: String. - * `tf.qint8`: Quantized 8-bit signed integer. * `tf.quint8`: Quantized 8-bit unsigned integer. * `tf.qint16`: Quantized 16-bit signed integer. diff --git a/tensorflow/g3doc/get_started/basic_usage.md b/tensorflow/g3doc/get_started/basic_usage.md index b4289a986d1946b0f83b9840734040017e8210f8..1603df9335ff0c947d2f03aad39a0acdf7c44d43 100644 --- a/tensorflow/g3doc/get_started/basic_usage.md +++ b/tensorflow/g3doc/get_started/basic_usage.md @@ -319,6 +319,6 @@ with tf.Session() as sess: A `placeholder()` operation generates an error if you do not supply a feed for it. See the [MNIST fully-connected feed tutorial](../tutorials/mnist/tf/index.md) -([source code](https://www.tensorflow.org/code/tensorflow/g3doc/tutorials/mnist/fully_connected_feed.py)) +([source code](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/fully_connected_feed.py)) for a larger-scale example of feeds. diff --git a/tensorflow/g3doc/how_tos/image_retraining/index.md b/tensorflow/g3doc/how_tos/image_retraining/index.md index 52c1d00a75c5c185d2eb097cd7bbae01770e2c7f..8ebbe57af1ec90ab20d9ba2c1a7d6852f2495f60 100644 --- a/tensorflow/g3doc/how_tos/image_retraining/index.md +++ b/tensorflow/g3doc/how_tos/image_retraining/index.md @@ -117,6 +117,22 @@ to run since there's randomness in the training process. 
 This number is based on the percent of the images in the test set that are
 given the correct label after the model is fully trained.

+## Visualizing the Retraining with TensorBoard
+
+The script includes TensorBoard summaries that make it easier to understand, debug, and optimize the retraining. For example, you can visualize the graph and statistics, such as how the weights or accuracy varied during training.
+
+To launch TensorBoard, run this command during or after retraining:
+
+```sh
+tensorboard --logdir /tmp/retrain_logs
+```
+
+Once TensorBoard is running, navigate your web browser to `localhost:6006` to view TensorBoard.
+
+The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
+
+The [TensorBoard README](../../../tensorboard/README.md) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
+
 ## Using the Retrained Model

 The script will write out a version of the Inception v3 network with a final
diff --git a/tensorflow/g3doc/tutorials/mnist/pros/index.md b/tensorflow/g3doc/tutorials/mnist/pros/index.md
index 73cc87eb57de3cdc152ad34ae7203fab2510d57e..12de1df66cb3101ae18920ec0ac0ac88b846800d 100644
--- a/tensorflow/g3doc/tutorials/mnist/pros/index.md
+++ b/tensorflow/g3doc/tutorials/mnist/pros/index.md
@@ -21,8 +21,8 @@ TensorFlow session.

 For your convenience, we've included
 [a script](https://www.tensorflow.org/code/tensorflow/examples/tutorials/mnist/input_data.py)
-which automatically downloads and imports the MNIST dataset. It will create a
-directory `'MNIST_data'` in which to store the data files.
+which will help you download and import the MNIST dataset. Run the following
+commands to create a directory `'MNIST_data'` in the current folder; the data
+files will be stored inside that directory.

 ```python
 from tensorflow.examples.tutorials.mnist import input_data
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 85a5cc7444f66172a1f3e992791fa331b0d81ef2..1f29426b4cf5d2c0b28990b429aaf0a05c5dd134 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -34,18 +34,14 @@ class DType(object):
   * `tf.bfloat16`: 16-bit truncated floating-point.
   * `tf.complex64`: 64-bit single-precision complex.
   * `tf.complex128`: 128-bit double-precision complex.
-
   * `tf.int8`: 8-bit signed integer.
   * `tf.uint8`: 8-bit unsigned integer.
   * `tf.uint16`: 16-bit unsigned integer.
   * `tf.int16`: 16-bit signed integer.
   * `tf.int32`: 32-bit signed integer.
   * `tf.int64`: 64-bit signed integer.
-
   * `tf.bool`: Boolean.
-
   * `tf.string`: String.
-
   * `tf.qint8`: Quantized 8-bit signed integer.
   * `tf.quint8`: Quantized 8-bit unsigned integer.
   * `tf.qint16`: Quantized 16-bit signed integer.
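Stepping back to the retraining changes above: the docs and the `retrain.py` diff both revolve around one summary-logging pattern. Here is a minimal, self-contained sketch of that pattern, using only the 0.x-era API names that appear in the diff (`tf.scalar_summary`, `tf.merge_all_summaries`, `tf.train.SummaryWriter`); the placeholder `x` and the `loss` stand-in are invented purely for illustration.

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, name='x')
loss = tf.square(x)               # stand-in for a real training objective
tf.scalar_summary('loss', loss)   # same summary call the retraining script uses

merged = tf.merge_all_summaries() # a single op that evaluates every summary
with tf.Session() as sess:
  writer = tf.train.SummaryWriter('/tmp/sketch_logs', sess.graph)
  for step in range(3):
    summary, _ = sess.run([merged, loss], feed_dict={x: float(step)})
    writer.add_summary(summary, step)  # tag each record with its step index
  writer.close()
```

Pointing `tensorboard --logdir` at the writer's directory then plots `loss` against the step index, which is exactly what the retraining script does for its training and validation accuracies.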
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py index 87e352819635ec9fad15051056ad4402b667d870..4a21d1acc63dd92a6160ad557ab1de932b2889c6 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -115,18 +115,22 @@ class UnaryOpTest(tf.test.TestCase): x_init_value=x) self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5) - def _check(self, result_tensor, result_np, input_sp_t): + def _check(self, result_tensor, result_np, input_sp_t, tol): self.assertTrue(isinstance(result_tensor, tf.SparseTensor)) self.assertTrue(isinstance(input_sp_t, tf.SparseTensor)) self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval()) self.assertAllEqual(input_sp_t.shape.eval(), result_tensor.shape.eval()) - self.assertAllClose(result_np, result_tensor.values.eval()) + if tol is None: + self.assertAllClose(result_np, result_tensor.values.eval()) + else: + self.assertAllClose(result_np, result_tensor.values.eval(), rtol=tol, + atol=tol) - def _compareSparseCpu(self, x, np_func, tf_func): + def _compareSparseCpu(self, x, np_func, tf_func, tol): x_sp, x_sp_vals = _sparsify(x) res_np = np_func(x_sp_vals) with self.test_session(use_gpu=False): - self._check(tf_func(x_sp), res_np, x_sp) + self._check(tf_func(x_sp), res_np, x_sp, tol) def _compareGpu(self, x, np_func, tf_func): np_ans = np_func(x) @@ -139,19 +143,19 @@ class UnaryOpTest(tf.test.TestCase): self.assertAllClose(np_ans, tf_gpu) # TODO(zhifengc/ke): make gradient checker work on GPU. - def _compareSparseGpu(self, x, np_func, tf_func): + def _compareSparseGpu(self, x, np_func, tf_func, tol): x_sp, x_sp_vals = _sparsify(x) res_np = np_func(x_sp_vals) with self.test_session(use_gpu=True): - self._check(tf_func(x_sp), res_np, x_sp) + self._check(tf_func(x_sp), res_np, x_sp, tol) def _compareBoth(self, x, np_func, tf_func): self._compareCpu(x, np_func, tf_func) self._compareGpu(x, np_func, tf_func) - def _compareBothSparse(self, x, np_func, tf_func): - self._compareSparseCpu(x, np_func, tf_func) - self._compareSparseGpu(x, np_func, tf_func) + def _compareBothSparse(self, x, np_func, tf_func, tol=None): + self._compareSparseCpu(x, np_func, tf_func, tol) + self._compareSparseGpu(x, np_func, tf_func, tol) def _inv(self, x): return 1.0 / x @@ -207,6 +211,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + self._compareBothSparse(z, np.sqrt, tf.sqrt, tol=1e-3) self._compareBothSparse(y, np.sign, tf.sign) def testFloatTanhEdge(self): @@ -243,6 +249,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + self._compareBothSparse(x, np.sqrt, tf.sqrt, tol=1e-3) self._compareBothSparse(x, np.sign, tf.sign) def testDoubleBasic(self): @@ -278,6 +286,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + self._compareBothSparse(z, np.sqrt, tf.sqrt, tol=1e-3) self._compareBothSparse(y, np.sign, tf.sign) def testHalfBasic(self): @@ -308,6 +318,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + 
self._compareBothSparse(z, np.sqrt, tf.sqrt, tol=1e-3) self._compareBothSparse(y, np.sign, tf.sign) def testInt32Basic(self): @@ -321,6 +333,7 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) self._compareBothSparse(x, np.sign, tf.sign) def testInt64Basic(self): @@ -335,6 +348,7 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) self._compareBothSparse(x, np.sign, tf.sign) def testComplex64Basic(self): @@ -358,6 +372,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + self._compareBothSparse(x, np.sqrt, tf.sqrt, 1e-3) # Numpy uses an incorrect definition of sign; use the right one instead. def complex_sign(x): @@ -386,6 +402,8 @@ class UnaryOpTest(tf.test.TestCase): self._compareBothSparse(x, np.abs, tf.abs) self._compareBothSparse(x, np.negative, tf.neg) + self._compareBothSparse(x, np.square, tf.square) + self._compareBothSparse(x, np.sqrt, tf.sqrt, 1e-3) # Numpy uses an incorrect definition of sign; use the right one instead. def complex_sign(x): diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py index 4ec455fd61bd5d0a06d48680328547a1c6f16ef6..90678929c0ad2dac9047257e9d71dde9e1f76b12 100644 --- a/tensorflow/python/kernel_tests/shape_ops_test.py +++ b/tensorflow/python/kernel_tests/shape_ops_test.py @@ -91,6 +91,15 @@ class ShapeOpsTest(tf.test.TestCase): self.assertAllEqual(np_ans, result) self.assertShapeEqual(np_ans, tf_ans) + def _compareSizeSparse(self, x_np, use_gpu=False): + np_ans = np.asarray(np.size(x_np)) + x_tf, unused_nnz = _sparsify(x_np) + with self.test_session(use_gpu=use_gpu): + tf_ans = tf.size(x_tf) + result = tf_ans.eval() + self.assertAllEqual(np_ans, result) + self.assertShapeEqual(np_ans, tf_ans) + def _testCpu(self, x): self._compareShape(x, use_gpu=False) self._compareShapeN(x, use_gpu=False) @@ -98,6 +107,7 @@ class ShapeOpsTest(tf.test.TestCase): self._compareSize(x, use_gpu=False) self._compareShapeSparse(x, use_gpu=False) self._compareRankSparse(x, use_gpu=False) + self._compareSizeSparse(x, use_gpu=False) def _testGpu(self, x): self._compareShape(x, use_gpu=True) @@ -106,6 +116,7 @@ class ShapeOpsTest(tf.test.TestCase): self._compareSize(x, use_gpu=True) self._compareShapeSparse(x, use_gpu=True) self._compareRankSparse(x, use_gpu=True) + self._compareSizeSparse(x, use_gpu=True) def _testAll(self, x): self._testCpu(x) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 2dbd7b94f26a36e171f67b15746a7feb6a174be8..0167aba0e8602e9089e00f352d7371726d0ed29f 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -125,11 +125,39 @@ def shape(input, name=None): """ with ops.op_scope([input], name, "Shape") as name: if isinstance(input, ops.SparseTensor): - return input.shape + return gen_math_ops.cast(input.shape, dtypes.int32) else: return gen_array_ops.shape(input, name=name) +def size(input, name=None): + """Returns the size of a tensor. + + This operation returns an integer representing the number of elements in + `input`. 
+
+  For example:
+
+  ```python
+  # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+  size(t) ==> 12
+  ```
+
+  Args:
+    input: A `Tensor` or `SparseTensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int32`.
+  """
+  with ops.op_scope([input], name, "Size") as name:
+    if isinstance(input, ops.SparseTensor):
+      return gen_math_ops._prod(gen_math_ops.cast(input.shape, dtypes.int32), 0,
+                                name=name)
+    else:
+      return gen_array_ops.size(input, name=name)
+
+
 def rank(input, name=None):
   """Returns the rank of a tensor.
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index d552221e606d9fb3bfa3febbf3790beda0aabc54..8bfd9ce8bf87674581d147722d8f6a0621088bff 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -60,7 +60,7 @@ def _SumGrad(op, grad):


 def _MinOrMaxGrad(op, grad):
-  """Gradient for Max or Max. Amazingly it's precisely the same code."""
+  """Gradient for Min or Max. Amazingly it's precisely the same code."""
   input_shape = array_ops.shape(op.inputs[0])
   output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
   y = op.outputs[0]
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 1647ceb1d5959e56cd7ac021906351187d8b2a8d..d27cefc61da5bf0c9a3b9ef43d3c69678ee91fd0 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -306,6 +306,48 @@ def sign(x, name=None):
     return gen_math_ops.sign(x, name=name)


+def square(x, name=None):
+  """Computes square of x element-wise.
+
+  I.e., \\(y = x * x = x^2\\).
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Square") as name:
+    if isinstance(x, ops.SparseTensor):
+      x_square = gen_math_ops.square(x.values, name=name)
+      return ops.SparseTensor(indices=x.indices, values=x_square, shape=x.shape)
+    else:
+      return gen_math_ops.square(x, name=name)
+
+
+def sqrt(x, name=None):
+  """Computes square root of x element-wise.
+
+  I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`, `complex64`, `complex128`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
+  """
+  with ops.op_scope([x], name, "Sqrt") as name:
+    if isinstance(x, ops.SparseTensor):
+      x_sqrt = gen_math_ops.sqrt(x.values, name=name)
+      return ops.SparseTensor(indices=x.indices, values=x_sqrt, shape=x.shape)
+    else:
+      return gen_math_ops.sqrt(x, name=name)
+
+
 def complex_abs(x, name=None):
   r"""Computes the complex absolute value of a tensor.
diff --git a/tensorflow/python/platform/flags.py b/tensorflow/python/platform/flags.py
index 4b22b3f5d674961a6d3e660bdeb0b9fab235da62..85f9e2cb860d02166e2171e1cebd9e5ebccfc25b 100644
--- a/tensorflow/python/platform/flags.py
+++ b/tensorflow/python/platform/flags.py
@@ -101,9 +101,12 @@ def DEFINE_boolean(flag_name, default_value, docstring):
                               help=docstring,
                               default=default_value,
                               type=str2bool)
+
+  # Add negated version, stay consistent with argparse with regard to
+  # dashes in flag names.
_global_parser.add_argument('--no' + flag_name, action='store_false', - dest=flag_name) + dest=flag_name.replace('-', '_')) # The internal google library defines the following alias, so we match diff --git a/tensorflow/python/platform/flags_test.py b/tensorflow/python/platform/flags_test.py index 473877eb1e684ba94c0a1c0f0c845710920f853a..39d92bd399cda96e6c3657407a0d707d7d87b64b 100644 --- a/tensorflow/python/platform/flags_test.py +++ b/tensorflow/python/platform/flags_test.py @@ -31,6 +31,7 @@ flags.DEFINE_float("float_foo", 42.0, "HelpString") flags.DEFINE_boolean("bool_foo", True, "HelpString") flags.DEFINE_boolean("bool_negation", True, "HelpString") +flags.DEFINE_boolean("bool-dash-negation", True, "HelpString") flags.DEFINE_boolean("bool_a", False, "HelpString") flags.DEFINE_boolean("bool_c", False, "HelpString") flags.DEFINE_boolean("bool_d", True, "HelpString") @@ -64,6 +65,10 @@ class FlagsTest(googletest.TestCase): # --bool_flag=True sets to True self.assertEqual(True, FLAGS.bool_c) + # --no before the flag mirrors argparse's behavior with + # regard to dashes in flag names + self.assertEqual(False, FLAGS.bool_dash_negation) + # --bool_flag=False sets to False self.assertEqual(False, FLAGS.bool_d) @@ -85,9 +90,9 @@ class FlagsTest(googletest.TestCase): if __name__ == "__main__": # Test command lines - sys.argv.extend(["--bool_a", "--nobool_negation", "--bool_c=True", - "--bool_d=False", "--bool_e=gibberish", "--unknown_flag", - "and_argument"]) + sys.argv.extend(["--bool_a", "--nobool_negation", "--nobool-dash-negation", + "--bool_c=True", "--bool_d=False", "--bool_e=gibberish", + "--unknown_flag", "and_argument"]) # googletest.main() tries to interpret the above flags, so use the # direct functions instead. diff --git a/tensorflow/stream_executor/dso_loader.cc b/tensorflow/stream_executor/dso_loader.cc index fab3385cca2966e52614b65cf863e83c0a2045a6..cce31ef4dcf7123b610c3f724bf8753b5d1625f1 100644 --- a/tensorflow/stream_executor/dso_loader.cc +++ b/tensorflow/stream_executor/dso_loader.cc @@ -72,7 +72,7 @@ string GetCudnnVersion() { return ""; } } /* static */ port::Status DsoLoader::GetLibcudaDsoHandle(void** dso_handle) { - return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", ""), + return GetDsoHandle(FindDsoPath(tensorflow::internal::FormatLibraryFileName("cuda", "1"), GetCudaDriverLibraryPath()), dso_handle); } diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 59cb647e450586b505a2c5cf7528cefe3d68089d..07b10de08cf8a77ca722f59e9acef070b8e813c9 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -27,7 +27,7 @@ from setuptools import find_packages, setup, Command, Extension from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution -_VERSION = '0.9.0' +_VERSION = '0.8.0' numpy_version = "1.8.2" if platform.system() == "Darwin": diff --git a/util/python/BUILD b/util/python/BUILD index a610c292994999ca8a29f883bfac7d039e675677..af05de20044ea0b2c1fd1d9b0de039daf7118f94 100644 --- a/util/python/BUILD +++ b/util/python/BUILD @@ -15,6 +15,7 @@ genrule( name = "python_check", srcs = [ "python_config.sh", + "configure_files" ], outs = [ "python_checked", @@ -22,3 +23,10 @@ genrule( cmd = "OUTPUTDIR=\"$(@D)/\"; $(location :python_config.sh) --check && touch $$OUTPUTDIR/python_checked", local = 1, ) + +filegroup( + name = "configure_files", + data = glob([ + "*", + ]) +)
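To round out the flags change above: argparse converts dashes to underscores when it derives `dest` from a long option, but only for the option it was actually given, so the generated `--no<flag>` form needs the explicit `dest=flag_name.replace('-', '_')` to land on the same attribute as the positive flag. A minimal, self-contained sketch of that behavior using only the standard library (the flag name mirrors the `bool-dash-negation` test flag and is otherwise arbitrary):

```python
import argparse

parser = argparse.ArgumentParser()
# Positive form: argparse derives dest='bool_dash_negation' automatically.
parser.add_argument('--bool-dash-negation', action='store_true', default=True)
# Negated form: without an explicit dest, argparse would derive
# 'nobool_dash_negation' and the two options would no longer share state.
parser.add_argument('--nobool-dash-negation', action='store_false',
                    dest='bool_dash_negation')

print(parser.parse_args([]).bool_dash_negation)                          # True
print(parser.parse_args(['--nobool-dash-negation']).bool_dash_negation)  # False
```

This is the same dest-sharing trick `DEFINE_boolean` relies on, which is what the new `flags_test.py` case exercises.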