diff --git a/arm/arm_init.c b/arm/arm_init.c
index fb3d50d04ae8dc5dad891905d13127a379494e59..3c012317417630696c4f0233ba413cb30693d2a2 100644
--- a/arm/arm_init.c
+++ b/arm/arm_init.c
@@ -1,9 +1,9 @@
 
 /* arm_init.c - NEON optimised filter functions
  *
- * Copyright (c) 2014 Glenn Randers-Pehrson
+ * Copyright (c) 2014,2016 Glenn Randers-Pehrson
  * Written by Mans Rullgard, 2011.
- * Last changed in libpng 1.6.16 [December 22, 2014]
+ * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -66,6 +66,7 @@ png_init_filter_functions_neon(png_structp pp, unsigned int bpp)
     * wrong order of the 'ON' and 'default' cases.  UNSET now defaults to OFF,
     * as documented in png.h
     */
+   png_debug(1, "in png_init_filter_functions_neon");
 #ifdef PNG_ARM_NEON_API_SUPPORTED
    switch ((pp->options >> PNG_ARM_NEON) & 3)
    {
diff --git a/arm/filter_neon_intrinsics.c b/arm/filter_neon_intrinsics.c
index d42c78890911c2cbf6ac035970dd8233f7421d04..22923b4b4fbb64a249b4e478c94fd3648eb463ed 100644
--- a/arm/filter_neon_intrinsics.c
+++ b/arm/filter_neon_intrinsics.c
@@ -1,11 +1,11 @@
 
 /* filter_neon_intrinsics.c - NEON optimised filter functions
  *
- * Copyright (c) 2014 Glenn Randers-Pehrson
+ * Copyright (c) 2014,2016 Glenn Randers-Pehrson
  * Written by James Yu <james.yu at linaro.org>, October 2013.
  * Based on filter_neon.S, written by Mans Rullgard, 2011.
  *
- * Last changed in libpng 1.6.16 [December 22, 2014]
+ * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
  *
  * This code is released under the libpng license.
  * For conditions of distribution and use, see the disclaimer
@@ -47,6 +47,8 @@ png_read_filter_row_up_neon(png_row_infop row_info, png_bytep row,
    png_bytep rp_stop = row + row_info->rowbytes;
    png_const_bytep pp = prev_row;
 
+   png_debug(1, "in png_read_filter_row_up_neon");
+
    for (; rp < rp_stop; rp += 16, pp += 16)
    {
       uint8x16_t qrp, qpp;
@@ -72,6 +74,8 @@ png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
    uint8x8x4_t vdest;
    vdest.val[3] = vdup_n_u8(0);
 
+   png_debug(1, "in png_read_filter_row_sub3_neon");
+
    for (; rp < rp_stop;)
    {
       uint8x8_t vtmp1, vtmp2;
@@ -113,6 +117,8 @@ png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
    uint8x8x4_t vdest;
    vdest.val[3] = vdup_n_u8(0);
 
+   png_debug(1, "in png_read_filter_row_sub4_neon");
+
    for (; rp < rp_stop; rp += 16)
    {
       uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
@@ -148,6 +154,8 @@ png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
    vrpt = png_ptr(uint8x8x2_t,&vtmp);
    vrp = *vrpt;
 
+   png_debug(1, "in png_read_filter_row_avg3_neon");
+
    for (; rp < rp_stop; pp += 12)
    {
       uint8x8_t vtmp1, vtmp2, vtmp3;
@@ -207,6 +215,8 @@ png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
    uint8x8x4_t vdest;
    vdest.val[3] = vdup_n_u8(0);
 
+   png_debug(1, "in png_read_filter_row_avg4_neon");
+
    for (; rp < rp_stop; rp += 16, pp += 16)
    {
       uint32x2x4_t vtmp;
@@ -280,6 +290,8 @@ png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
    vrpt = png_ptr(uint8x8x2_t,&vtmp);
    vrp = *vrpt;
 
+   png_debug(1, "in png_read_filter_row_paeth3_neon");
+
    for (; rp < rp_stop; pp += 12)
    {
       uint8x8x2_t *vppt;
@@ -339,6 +351,8 @@ png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
    uint8x8x4_t vdest;
    vdest.val[3] = vdup_n_u8(0);
 
+   png_debug(1, "in png_read_filter_row_paeth4_neon");
+
    for (; rp < rp_stop; rp += 16, pp += 16)
    {
       uint32x2x4_t vtmp;
diff --git a/contrib/intel/INSTALL b/contrib/intel/INSTALL
index d13675449ef8e8a7c75e55fa41b2f70cdf0db827..105c953feb0f0e2b39be4752eb1fed9cba3543d4 100644
--- a/contrib/intel/INSTALL
+++ b/contrib/intel/INSTALL
@@ -2,3 +2,7 @@
 To enable SSE support in libpng, manually edit configure.ac and Makefile.am,
 following the instructions in the configure.ac.patch and Makefile.am.patch
 files, then configure with -DPNG_INTEL_SSE in CPPFLAGS.
+
+If you move the *.c files to a different directory, update the
+'#include "../../pngpriv.h"' line in both files so that it points to
+the correct relative location of pngpriv.h.
diff --git a/contrib/intel/filter_sse2_intrinsics.c b/contrib/intel/filter_sse2_intrinsics.c
index 7c359b580e92420b49ff301e34a923b029958130..fcd875f6b3fe553972e016d0a2b7d131df279c74 100644
--- a/contrib/intel/filter_sse2_intrinsics.c
+++ b/contrib/intel/filter_sse2_intrinsics.c
@@ -4,7 +4,7 @@
  * Copyright (c) 2016 Google, Inc.
  * Written by Mike Klein and Matt Sarett
  * Derived from arm/filter_neon_intrinsics.c, which was
- * Copyright (c) 2014 Glenn Randers-Pehrson
+ * Copyright (c) 2014,2016 Glenn Randers-Pehrson
  *
  * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
  *
@@ -55,6 +55,7 @@ void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
     * There is no pixel to the left of the first pixel.  It's encoded directly.
     * That works with our main loop if we just say that left pixel was zero.
     */
+   png_debug(1, "in png_read_filter_row_sub3_sse2");
    __m128i a, d = _mm_setzero_si128();
 
    int rb = row_info->rowbytes;
@@ -75,6 +76,7 @@ void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
     * There is no pixel to the left of the first pixel.  It's encoded directly.
     * That works with our main loop if we just say that left pixel was zero.
     */
+   png_debug(1, "in png_read_filter_row_sub4_sse2");
    __m128i a, d = _mm_setzero_si128();
 
    int rb = row_info->rowbytes;
@@ -96,6 +98,7 @@ void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
     * predicted to be half of the pixel above it.  So again, this works
     * perfectly with our loop if we make sure a starts at zero.
     */
+   png_debug(1, "in png_read_filter_row_avg3_sse2");
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;
@@ -128,6 +131,7 @@ void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
     * predicted to be half of the pixel above it.  So again, this works
     * perfectly with our loop if we make sure a starts at zero.
     */
+   png_debug(1, "in png_read_filter_row_avg4_sse2");
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;
@@ -196,6 +200,7 @@ void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
     * Here we zero b and d, which become c and a respectively at the start of
     * the loop.
     */
+   png_debug(1, "in png_read_filter_row_paeth3_sse2");
    const __m128i zero = _mm_setzero_si128();
    __m128i c, b = zero,
            a, d = zero;
@@ -254,6 +259,7 @@ void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
     * Here we zero b and d, which become c and a respectively at the start of
     * the loop.
     */
+   png_debug(1, "in png_read_filter_row_paeth4_sse2");
    const __m128i zero = _mm_setzero_si128();
    __m128i c, b = zero,
            a, d = zero;
diff --git a/contrib/intel/intel_init.c b/contrib/intel/intel_init.c
index fc0d9abfdb3fdc78af9087e4b18cb65c6c7a42d4..357e147b9d9062dfd76e96ffd58cd032a6cb4e1c 100644
--- a/contrib/intel/intel_init.c
+++ b/contrib/intel/intel_init.c
@@ -4,7 +4,7 @@
  * Copyright (c) 2016 Google, Inc.
  * Written by Mike Klein and Matt Sarett
  * Derived from arm/arm_init.c, which was
- * Copyright (c) 2014 Glenn Randers-Pehrson
+ * Copyright (c) 2014,2016 Glenn Randers-Pehrson
  *
  * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
  *
@@ -29,6 +29,7 @@ png_init_filter_functions_sse2(png_structp pp, unsigned int bpp)
     * Most of these can be implemented using only MMX and 64-bit registers,
     * but they end up a bit slower than using the equally-ubiquitous SSE2.
    */
+   png_debug(1, "in png_init_filter_functions_sse2");
    if (bpp == 3)
    {
       pp->read_filter[PNG_FILTER_VALUE_SUB-1] = png_read_filter_row_sub3_sse2;