Merge pull request #19658 from alalek:update_libwebp

* 3rdparty: update libwebp 1.1.0 => 1.2.0 - https://github.com/webmproject/libwebp/releases/tag/v1.2.0 * 3rdparty(libwebp): re-apply OpenCV patches

Merge pull request #19658 from alalek:update_libwebp
* 3rdparty: update libwebp 1.1.0 => 1.2.0 - https://github.com/webmproject/libwebp/releases/tag/v1.2.0 * 3rdparty(libwebp): re-apply OpenCV patches
5d499a18 · Alexander Alekhin · GitHub · 19865a26 · 5d499a18 · 5d499a18
33 changed file
--- a/3rdparty/libwebp/src/dec/io_dec.c
+++ b/3rdparty/libwebp/src/dec/io_dec.c
@@ -25,21 +25,16 @@
 static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* output = p->output;
  const WebPYUVABuffer* const buf = &output->u.YUVA;
-  uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride;
-  uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride;
-  uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride;
+  uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
+  uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
+  uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
  const int uv_w = (mb_w + 1) / 2;
  const int uv_h = (mb_h + 1) / 2;
-  int j;
-  for (j = 0; j < mb_h; ++j) {
-    memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w);
-  }
-  for (j = 0; j < uv_h; ++j) {
-    memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w);
-    memcpy(v_dst + j * buf->v_stride, io->v + j * io->uv_stride, uv_w);
-  }
+  WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
+  WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
+  WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
  return io->mb_h;
 }

@@ -47,7 +42,7 @@ static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
  WebPDecBuffer* const output = p->output;
  WebPRGBABuffer* const buf = &output->u.RGBA;
-  uint8_t* const dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
  WebPSamplerProcessPlane(io->y, io->y_stride,
                          io->u, io->v, io->uv_stride,
                          dst, buf->stride, io->mb_w, io->mb_h,
@@ -62,7 +57,7 @@ static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
  int num_lines_out = io->mb_h;   // a priori guess
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + io->mb_y * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
  WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
  const uint8_t* cur_y = io->y;
  const uint8_t* cur_u = io->u;
@@ -133,7 +128,7 @@ static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
  const int mb_w = io->mb_w;
  const int mb_h = io->mb_h;
-  uint8_t* dst = buf->a + io->mb_y * buf->a_stride;
+  uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
  int j;
  (void)expected_num_lines_out;
  assert(expected_num_lines_out == mb_h);
@@ -186,7 +181,7 @@ static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
        (colorspace == MODE_ARGB || colorspace == MODE_Argb);
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
    uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
    const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
@@ -210,7 +205,7 @@ static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
    const WEBP_CSP_MODE colorspace = p->output->colorspace;
    const WebPRGBABuffer* const buf = &p->output->u.RGBA;
    int num_rows;
-    const int start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
+    const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
    uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
    uint8_t* alpha_dst = base_rgba;
@@ -276,9 +271,9 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
 static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
                                int expected_num_lines_out) {
  const WebPYUVABuffer* const buf = &p->output->u.YUVA;
-  uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
+  uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
  if (io->a != NULL) {
-    uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+    uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
    const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
    assert(expected_num_lines_out == num_lines_out);
    if (num_lines_out > 0) {   // unmultiply the Y
@@ -356,7 +351,7 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
  const WebPYUV444Converter convert =
      WebPYUV444Converters[p->output->colorspace];
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* dst = buf->rgba + y_pos * buf->stride;
+  uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
  int num_lines_out = 0;
  // For RGB rescaling, because of the YUV420, current scan position
  // U/V can be +1/-1 line from the Y one.  Hence the double test.
@@ -383,15 +378,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
  while (j < mb_h) {
    const int y_lines_in =
        WebPRescalerImport(p->scaler_y, mb_h - j,
-                           io->y + j * io->y_stride, io->y_stride);
+                           io->y + (size_t)j * io->y_stride, io->y_stride);
    j += y_lines_in;
    if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
-      const int u_lines_in =
-          WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
-                             io->u + uv_j * io->uv_stride, io->uv_stride);
-      const int v_lines_in =
-          WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
-                             io->v + uv_j * io->uv_stride, io->uv_stride);
+      const int u_lines_in = WebPRescalerImport(
+          p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
+      const int v_lines_in = WebPRescalerImport(
+          p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
+          io->uv_stride);
      (void)v_lines_in;   // remove a gcc warning
      assert(u_lines_in == v_lines_in);
      uv_j += u_lines_in;
@@ -403,7 +398,7 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {

 static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
  const WEBP_CSP_MODE colorspace = p->output->colorspace;
  const int alpha_first =
      (colorspace == MODE_ARGB || colorspace == MODE_Argb);
@@ -431,7 +426,7 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
 static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
                               int max_lines_out) {
  const WebPRGBABuffer* const buf = &p->output->u.RGBA;
-  uint8_t* const base_rgba = buf->rgba + y_pos * buf->stride;
+  uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
 #if (WEBP_SWAP_16BIT_CSP == 1)
  uint8_t* alpha_dst = base_rgba;
 #else
@@ -470,7 +465,7 @@ static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
    int lines_left = expected_num_out_lines;
    const int y_end = p->last_y + lines_left;
    while (lines_left > 0) {
-      const int row_offset = scaler->src_y - io->mb_y;
+      const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
      WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
                         io->a + row_offset * io->width, io->width);
      lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);

--- a/3rdparty/libwebp/src/dec/vp8_dec.c
+++ b/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -494,13 +494,11 @@ static int GetCoeffsAlt(VP8BitReader* const br,
  return 16;
 }

-static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
-  if (GetCoeffs == NULL) {
-    if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
-      GetCoeffs = GetCoeffsAlt;
-    } else {
-      GetCoeffs = GetCoeffsFast;
-    }
+WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
+  if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+    GetCoeffs = GetCoeffsAlt;
+  } else {
+    GetCoeffs = GetCoeffsFast;
  }
 }


--- a/3rdparty/libwebp/src/dec/vp8i_dec.h
+++ b/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -31,7 +31,7 @@ extern "C" {

 // version numbers
 #define DEC_MAJ_VERSION 1
-#define DEC_MIN_VERSION 1
+#define DEC_MIN_VERSION 2
 #define DEC_REV_VERSION 0

 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).

--- a/3rdparty/libwebp/src/dec/vp8l_dec.c
+++ b/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -947,7 +947,6 @@ static WEBP_INLINE void CopyBlock8b(uint8_t* const dst, int dist, int length) {
        break;
      default:
        goto Copy;
-        break;
    }
    CopySmallPattern8b(src, dst, length, pattern);
    return;

--- a/3rdparty/libwebp/src/demux/anim_decode.c
+++ b/3rdparty/libwebp/src/demux/anim_decode.c
@@ -346,12 +346,15 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
  {
    const uint8_t* in = iter.fragment.bytes;
    const size_t in_size = iter.fragment.size;
-    const size_t out_offset =
-        (iter.y_offset * width + iter.x_offset) * NUM_CHANNELS;
+    const uint32_t stride = width * NUM_CHANNELS;  // at most 25 + 2 bits
+    const uint64_t out_offset = (uint64_t)iter.y_offset * stride +
+                                (uint64_t)iter.x_offset * NUM_CHANNELS;  // 53b
+    const uint64_t size = (uint64_t)iter.height * stride;  // at most 25 + 27b
    WebPDecoderConfig* const config = &dec->config_;
    WebPRGBABuffer* const buf = &config->output.u.RGBA;
-    buf->stride = NUM_CHANNELS * width;
-    buf->size = buf->stride * iter.height;
+    if ((size_t)size != size) goto Error;
+    buf->stride = (int)stride;
+    buf->size = (size_t)size;
    buf->rgba = dec->curr_frame_ + out_offset;

    if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {

--- a/3rdparty/libwebp/src/demux/demux.c
+++ b/3rdparty/libwebp/src/demux/demux.c
@@ -24,7 +24,7 @@
 #include "src/webp/format_constants.h"

 #define DMUX_MAJ_VERSION 1
-#define DMUX_MIN_VERSION 1
+#define DMUX_MIN_VERSION 2
 #define DMUX_REV_VERSION 0

 typedef struct {
@@ -312,6 +312,7 @@ static ParseStatus ParseAnimationFrame(
  int bits;
  MemBuffer* const mem = &dmux->mem_;
  Frame* frame;
+  size_t start_offset;
  ParseStatus status =
      NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
  if (status != PARSE_OK) return status;
@@ -332,7 +333,11 @@ static ParseStatus ParseAnimationFrame(

  // Store a frame only if the animation flag is set there is some data for
  // this frame is available.
+  start_offset = mem->start_;
  status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
+  if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
+    status = PARSE_ERROR;
+  }
  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
    added_frame = AddFrame(dmux, frame);
    if (added_frame) {

--- a/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -359,6 +359,11 @@ static int HasAlpha32b_C(const uint8_t* src, int length) {
  return 0;
 }

+static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
+  int x;
+  for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
+}
+
 //------------------------------------------------------------------------------
 // Simple channel manipulations.

@@ -400,6 +405,7 @@ void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,

 int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);

 //------------------------------------------------------------------------------
 // Init function
@@ -428,6 +434,7 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {

  WebPHasAlpha8b = HasAlpha8b_C;
  WebPHasAlpha32b = HasAlpha32b_C;
+  WebPAlphaReplace = AlphaReplace_C;

  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
  if (VP8GetCPUInfo != NULL) {
@@ -469,4 +476,5 @@ WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
  assert(WebPPackRGB != NULL);
  assert(WebPHasAlpha8b != NULL);
  assert(WebPHasAlpha32b != NULL);
+  assert(WebPAlphaReplace != NULL);
 }
--- a/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -265,6 +265,27 @@ static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
  return 0;
 }

+static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
+  const __m128i m_color = _mm_set1_epi32(color);
+  const __m128i zero = _mm_setzero_si128();
+  int i = 0;
+  for (; i + 8 <= length; i += 8) {
+    const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
+    const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
+    const __m128i b0 = _mm_srai_epi32(a0, 24);
+    const __m128i b1 = _mm_srai_epi32(a1, 24);
+    const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
+    const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
+    const __m128i d0 = _mm_and_si128(c0, m_color);
+    const __m128i d1 = _mm_and_si128(c1, m_color);
+    const __m128i e0 = _mm_andnot_si128(c0, a0);
+    const __m128i e1 = _mm_andnot_si128(c1, a1);
+    _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
+    _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
+  }
+  for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
+}
+
 // -----------------------------------------------------------------------------
 // Apply alpha value to rows

@@ -334,6 +355,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {

  WebPHasAlpha8b = HasAlpha8b_SSE2;
  WebPHasAlpha32b = HasAlpha32b_SSE2;
+  WebPAlphaReplace = AlphaReplace_SSE2;
 }

 #else  // !WEBP_USE_SSE2

--- a/3rdparty/libwebp/src/dsp/cpu.c
+++ b/3rdparty/libwebp/src/dsp/cpu.c
@@ -55,12 +55,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
    : "a"(info_type), "c"(0));
 }
-#elif (defined(_M_X64) || defined(_M_IX86)) && \
-      defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+
+#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729  // >= VS2008 SP1
 #include <intrin.h>
 #define GetCPUInfo(info, type) __cpuidex(info, type, 0)  // set ecx=0
-#elif defined(WEBP_MSC_SSE2)
+#define WEBP_HAVE_MSC_CPUID
+#elif _MSC_VER > 1310
+#include <intrin.h>
 #define GetCPUInfo __cpuid
+#define WEBP_HAVE_MSC_CPUID
+#endif
+
 #endif

 // NaCl has no support for xgetbv or the raw opcode.
@@ -94,7 +100,7 @@ static WEBP_INLINE uint64_t xgetbv(void) {
 #define xgetbv() 0U  // no AVX for older x64 or unrecognized toolchains.
 #endif

-#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)

 // helper function for run-time detection of slow SSSE3 platforms
 static int CheckSlowModel(int info) {
@@ -179,6 +185,30 @@ static int AndroidCPUInfo(CPUFeature feature) {
  return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
+#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
+// Use compile flags as an indicator of SIMD support instead of a runtime check.
+static int wasmCPUInfo(CPUFeature feature) {
+  switch (feature) {
+#ifdef WEBP_USE_SSE2
+    case kSSE2:
+      return 1;
+#endif
+#ifdef WEBP_USE_SSE41
+    case kSSE3:
+    case kSlowSSSE3:
+    case kSSE4_1:
+      return 1;
+#endif
+#ifdef WEBP_USE_NEON
+    case kNEON:
+      return 1;
+#endif
+    default:
+      break;
+  }
+  return 0;
+}
+VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
 #elif defined(WEBP_USE_NEON)
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL

--- a/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -1283,12 +1283,12 @@ static void DC4_NEON(uint8_t* dst) {    // DC
  const uint8x8_t A = vld1_u8(dst - BPS);  // top row
  const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
  const uint16x4_t p1 = vpadd_u16(p0, p0);
-  const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-  const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-  const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-  const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-  const uint16x8_t s0 = vaddq_u16(L0, L1);
-  const uint16x8_t s1 = vaddq_u16(L2, L3);
+  const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+  const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+  const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+  const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+  const uint16x8_t s0 = vaddl_u8(L0, L1);
+  const uint16x8_t s1 = vaddl_u8(L2, L3);
  const uint16x8_t s01 = vaddq_u16(s0, s1);
  const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
@@ -1429,8 +1429,7 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  if (do_top) {
    const uint8x8_t A = vld1_u8(dst - BPS);  // top row
 #if defined(__aarch64__)
-    const uint16x8_t B = vmovl_u8(A);
-    const uint16_t p2 = vaddvq_u16(B);
+    const uint16_t p2 = vaddlv_u8(A);
    sum_top = vdupq_n_u16(p2);
 #else
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
@@ -1441,18 +1440,18 @@ static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
  }

  if (do_left) {
-    const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
-    const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
-    const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
-    const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
-    const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
-    const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
-    const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
-    const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
-    const uint16x8_t s0 = vaddq_u16(L0, L1);
-    const uint16x8_t s1 = vaddq_u16(L2, L3);
-    const uint16x8_t s2 = vaddq_u16(L4, L5);
-    const uint16x8_t s3 = vaddq_u16(L6, L7);
+    const uint8x8_t L0 = vld1_u8(dst + 0 * BPS - 1);
+    const uint8x8_t L1 = vld1_u8(dst + 1 * BPS - 1);
+    const uint8x8_t L2 = vld1_u8(dst + 2 * BPS - 1);
+    const uint8x8_t L3 = vld1_u8(dst + 3 * BPS - 1);
+    const uint8x8_t L4 = vld1_u8(dst + 4 * BPS - 1);
+    const uint8x8_t L5 = vld1_u8(dst + 5 * BPS - 1);
+    const uint8x8_t L6 = vld1_u8(dst + 6 * BPS - 1);
+    const uint8x8_t L7 = vld1_u8(dst + 7 * BPS - 1);
+    const uint16x8_t s0 = vaddl_u8(L0, L1);
+    const uint16x8_t s1 = vaddl_u8(L2, L3);
+    const uint16x8_t s2 = vaddl_u8(L4, L5);
+    const uint16x8_t s3 = vaddl_u8(L6, L7);
    const uint16x8_t s01 = vaddq_u16(s0, s1);
    const uint16x8_t s23 = vaddq_u16(s2, s3);
    sum_left = vaddq_u16(s01, s23);
@@ -1512,29 +1511,34 @@ static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {

  if (do_top) {
    const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
+#if defined(__aarch64__)
+    const uint16_t p3 = vaddlvq_u8(A);
+    sum_top = vdupq_n_u16(p3);
+#else
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
+#endif
  }

  if (do_left) {
    int i;
    sum_left = vdupq_n_u16(0);
    for (i = 0; i < 16; i += 8) {
-      const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
-      const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
-      const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
-      const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
-      const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
-      const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
-      const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
-      const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
-      const uint16x8_t s0 = vaddq_u16(L0, L1);
-      const uint16x8_t s1 = vaddq_u16(L2, L3);
-      const uint16x8_t s2 = vaddq_u16(L4, L5);
-      const uint16x8_t s3 = vaddq_u16(L6, L7);
+      const uint8x8_t L0 = vld1_u8(dst + (i + 0) * BPS - 1);
+      const uint8x8_t L1 = vld1_u8(dst + (i + 1) * BPS - 1);
+      const uint8x8_t L2 = vld1_u8(dst + (i + 2) * BPS - 1);
+      const uint8x8_t L3 = vld1_u8(dst + (i + 3) * BPS - 1);
+      const uint8x8_t L4 = vld1_u8(dst + (i + 4) * BPS - 1);
+      const uint8x8_t L5 = vld1_u8(dst + (i + 5) * BPS - 1);
+      const uint8x8_t L6 = vld1_u8(dst + (i + 6) * BPS - 1);
+      const uint8x8_t L7 = vld1_u8(dst + (i + 7) * BPS - 1);
+      const uint16x8_t s0 = vaddl_u8(L0, L1);
+      const uint16x8_t s1 = vaddl_u8(L2, L3);
+      const uint16x8_t s2 = vaddl_u8(L4, L5);
+      const uint16x8_t s3 = vaddl_u8(L6, L7);
      const uint16x8_t s01 = vaddq_u16(s0, s1);
      const uint16x8_t s23 = vaddq_u16(s2, s3);
      const uint16x8_t sum = vaddq_u16(s01, s23);

--- a/3rdparty/libwebp/src/dsp/dsp.h
+++ b/3rdparty/libwebp/src/dsp/dsp.h
@@ -51,9 +51,7 @@ extern "C" {
 # define __has_builtin(x) 0
 #endif

-// for now, none of the optimizations below are available in emscripten
-#if !defined(EMSCRIPTEN)
-
+#if !defined(HAVE_CONFIG_H)
 #if defined(_MSC_VER) && _MSC_VER > 1310 && \
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
@@ -63,6 +61,7 @@ extern "C" {
    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
 #endif
+#endif

 // WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
 // files without intrinsics, allowing the corresponding Init() to be called.
@@ -76,6 +75,9 @@ extern "C" {
 #define WEBP_USE_SSE41
 #endif

+#undef WEBP_MSC_SSE41
+#undef WEBP_MSC_SSE2
+
 // The intrinsics currently cause compiler errors with arm-nacl-gcc and the
 // inline assembly would need to be modified for use with Native Client.
 #if (defined(__ARM_NEON__) || \
@@ -110,8 +112,6 @@ extern "C" {
 #define WEBP_USE_MSA
 #endif

-#endif  /* EMSCRIPTEN */
-
 #ifndef WEBP_DSP_OMIT_C_CODE
 #define WEBP_DSP_OMIT_C_CODE 1
 #endif
@@ -193,6 +193,12 @@ extern "C" {
 #endif
 #endif

+// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
+// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
+#if !defined(WEBP_OFFSET_PTR)
+#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
+#endif
+
 // Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
 #if !defined(WEBP_SWAP_16BIT_CSP)
 #define WEBP_SWAP_16BIT_CSP 0
@@ -632,6 +638,8 @@ extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
 extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
 // This function returns true if src[4*i] contains a value different from 0xff.
 extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
+// replaces transparent values in src[] by 'color'.
+extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);

 // To be called first before using the above.
 void WebPInitAlphaProcessing(void);

--- a/3rdparty/libwebp/src/dsp/lossless.c
+++ b/3rdparty/libwebp/src/dsp/lossless.c
@@ -107,62 +107,62 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 //------------------------------------------------------------------------------
 // Predictors

-static uint32_t Predictor0_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
  (void)top;
  (void)left;
  return ARGB_BLACK;
 }
-static uint32_t Predictor1_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
  (void)top;
  return left;
 }
-static uint32_t Predictor2_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[0];
 }
-static uint32_t Predictor3_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[1];
 }
-static uint32_t Predictor4_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
  (void)left;
  return top[-1];
 }
-static uint32_t Predictor5_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average3(left, top[0], top[1]);
  return pred;
 }
-static uint32_t Predictor6_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[-1]);
  return pred;
 }
-static uint32_t Predictor7_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(left, top[0]);
  return pred;
 }
-static uint32_t Predictor8_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[-1], top[0]);
  (void)left;
  return pred;
 }
-static uint32_t Predictor9_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average2(top[0], top[1]);
  (void)left;
  return pred;
 }
-static uint32_t Predictor10_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
  return pred;
 }
-static uint32_t Predictor11_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = Select(top[0], left, top[-1]);
  return pred;
 }
-static uint32_t Predictor12_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
  return pred;
 }
-static uint32_t Predictor13_C(uint32_t left, const uint32_t* const top) {
+uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
  return pred;
 }
@@ -182,18 +182,18 @@ static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
    out[i] = left = VP8LAddPixels(in[i], left);
  }
 }
-GENERATE_PREDICTOR_ADD(Predictor2_C, PredictorAdd2_C)
-GENERATE_PREDICTOR_ADD(Predictor3_C, PredictorAdd3_C)
-GENERATE_PREDICTOR_ADD(Predictor4_C, PredictorAdd4_C)
-GENERATE_PREDICTOR_ADD(Predictor5_C, PredictorAdd5_C)
-GENERATE_PREDICTOR_ADD(Predictor6_C, PredictorAdd6_C)
-GENERATE_PREDICTOR_ADD(Predictor7_C, PredictorAdd7_C)
-GENERATE_PREDICTOR_ADD(Predictor8_C, PredictorAdd8_C)
-GENERATE_PREDICTOR_ADD(Predictor9_C, PredictorAdd9_C)
-GENERATE_PREDICTOR_ADD(Predictor10_C, PredictorAdd10_C)
-GENERATE_PREDICTOR_ADD(Predictor11_C, PredictorAdd11_C)
-GENERATE_PREDICTOR_ADD(Predictor12_C, PredictorAdd12_C)
-GENERATE_PREDICTOR_ADD(Predictor13_C, PredictorAdd13_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
+GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)

 //------------------------------------------------------------------------------

@@ -562,7 +562,6 @@ VP8LPredictorFunc VP8LPredictors[16];

 // exposed plain-C implementations
 VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
-VP8LPredictorFunc VP8LPredictors_C[16];

 VP8LTransformColorInverseFunc VP8LTransformColorInverse;

@@ -600,8 +599,7 @@ extern void VP8LDspInitMSA(void);
 } while (0);

 WEBP_DSP_INIT_FUNC(VP8LDspInit) {
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
-  COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+  COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
  COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)


--- a/3rdparty/libwebp/src/dsp/lossless.h
+++ b/3rdparty/libwebp/src/dsp/lossless.h
@@ -30,7 +30,22 @@ extern "C" {

 typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
 extern VP8LPredictorFunc VP8LPredictors[16];
-extern VP8LPredictorFunc VP8LPredictors_C[16];
+
+uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
+uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
+
 // These Add/Sub function expects upper[-1] and out[-1] to be readable.
 typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
                                        const uint32_t* upper, int num_pixels,

--- a/3rdparty/libwebp/src/dsp/lossless_common.h
+++ b/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -184,19 +184,6 @@ static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
  }                                                                  \
 }

-// It subtracts the prediction from the input pixel and stores the residual
-// in the output pixel.
-#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB)             \
-static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
-                          int num_pixels, uint32_t* out) {           \
-  int x;                                                             \
-  assert(upper != NULL);                                             \
-  for (x = 0; x < num_pixels; ++x) {                                 \
-    const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x);         \
-    out[x] = VP8LSubPixels(in[x], pred);                             \
-  }                                                                  \
-}
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif

--- a/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -702,140 +702,6 @@ void VP8LHistogramAdd(const VP8LHistogram* const a,
 //------------------------------------------------------------------------------
 // Image transforms.

-static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
-  return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
-}
-
-static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
-  return Average2(Average2(a0, a2), a1);
-}
-
-static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
-                                     uint32_t a2, uint32_t a3) {
-  return Average2(Average2(a0, a1), Average2(a2, a3));
-}
-
-static WEBP_INLINE uint32_t Clip255(uint32_t a) {
-  if (a < 256) {
-    return a;
-  }
-  // return 0, when a is a negative integer.
-  // return 255, when a is positive.
-  return ~a >> 24;
-}
-
-static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
-  return Clip255(a + b - c);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
-  const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
-                                         (c1 >> 16) & 0xff,
-                                         (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
-                                         (c1 >> 8) & 0xff,
-                                         (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
-  return Clip255(a + (a - b) / 2);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
-  const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-// gcc-4.9 on ARM generates incorrect code in Select() when Sub3() is inlined.
-#if defined(__arm__) && \
-    (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
-# define LOCAL_INLINE __attribute__ ((noinline))
-#else
-# define LOCAL_INLINE WEBP_INLINE
-#endif
-
-static LOCAL_INLINE int Sub3(int a, int b, int c) {
-  const int pb = b - c;
-  const int pa = a - c;
-  return abs(pb) - abs(pa);
-}
-
-#undef LOCAL_INLINE
-
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
-  const int pa_minus_pb =
-      Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
-      Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
-      Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
-      Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
-  return (pa_minus_pb <= 0) ? a : b;
-}
-
-//------------------------------------------------------------------------------
-// Predictors
-
-static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[0];
-}
-static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[1];
-}
-static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
-  (void)left;
-  return top[-1];
-}
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average3(left, top[0], top[1]);
-  return pred;
-}
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[-1]);
-  return pred;
-}
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(left, top[0]);
-  return pred;
-}
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[-1], top[0]);
-  (void)left;
-  return pred;
-}
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average2(top[0], top[1]);
-  (void)left;
-  return pred;
-}
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
-  return pred;
-}
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
-  return pred;
-}
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
-  return pred;
-}
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
-  return pred;
-}
-
-//------------------------------------------------------------------------------
-
 static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
                            int num_pixels, uint32_t* out) {
  int i;
@@ -850,18 +716,33 @@ static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
  (void)upper;
 }

-GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)
-GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
-GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
-GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
-GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
-GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
-GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
-GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
-GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
-GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
-GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
-GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR_I)                                \
+static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in,              \
+                                          const uint32_t* upper,           \
+                                          int num_pixels, uint32_t* out) { \
+  int x;                                                                   \
+  assert(upper != NULL);                                                   \
+  for (x = 0; x < num_pixels; ++x) {                                       \
+    const uint32_t pred =                                                  \
+        VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x);              \
+    out[x] = VP8LSubPixels(in[x], pred);                                   \
+  }                                                                        \
+}
+
+GENERATE_PREDICTOR_SUB(2)
+GENERATE_PREDICTOR_SUB(3)
+GENERATE_PREDICTOR_SUB(4)
+GENERATE_PREDICTOR_SUB(5)
+GENERATE_PREDICTOR_SUB(6)
+GENERATE_PREDICTOR_SUB(7)
+GENERATE_PREDICTOR_SUB(8)
+GENERATE_PREDICTOR_SUB(9)
+GENERATE_PREDICTOR_SUB(10)
+GENERATE_PREDICTOR_SUB(11)
+GENERATE_PREDICTOR_SUB(12)
+GENERATE_PREDICTOR_SUB(13)

 //------------------------------------------------------------------------------


--- a/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -249,6 +249,7 @@ static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
    }                                  \
  } while (0)

+#if !(defined(__i386__) || defined(_M_IX86))
 static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
  int i;
  double retval = 0.;
@@ -300,6 +301,8 @@ static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
  return (float)retval;
 }
+#endif  // !(defined(__i386__) || defined(_M_IX86))
+
 #undef ANALYZE_X_OR_Y
 #undef ANALYZE_XY

@@ -460,20 +463,22 @@ static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
  (void)upper;
 }

-#define GENERATE_PREDICTOR_1(X, IN)                                           \
-static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
-                                   int num_pixels, uint32_t* out) {           \
-  int i;                                                                      \
-  for (i = 0; i + 4 <= num_pixels; i += 4) {                                  \
-    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);              \
-    const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));              \
-    const __m128i res = _mm_sub_epi8(src, pred);                              \
-    _mm_storeu_si128((__m128i*)&out[i], res);                                 \
-  }                                                                           \
-  if (i != num_pixels) {                                                      \
-    VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i);     \
-  }                                                                           \
-}
+#define GENERATE_PREDICTOR_1(X, IN)                                         \
+  static void PredictorSub##X##_SSE2(const uint32_t* const in,              \
+                                     const uint32_t* const upper,           \
+                                     int num_pixels, uint32_t* const out) { \
+    int i;                                                                  \
+    for (i = 0; i + 4 <= num_pixels; i += 4) {                              \
+      const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);          \
+      const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN));          \
+      const __m128i res = _mm_sub_epi8(src, pred);                          \
+      _mm_storeu_si128((__m128i*)&out[i], res);                             \
+    }                                                                       \
+    if (i != num_pixels) {                                                  \
+      VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i),           \
+                               num_pixels - i, out + i);                    \
+    }                                                                       \
+  }

 GENERATE_PREDICTOR_1(1, in[i - 1])       // Predictor1: L
 GENERATE_PREDICTOR_1(2, upper[i])        // Predictor2: T
@@ -657,7 +662,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
  VP8LAddVector = AddVector_SSE2;
  VP8LAddVectorEq = AddVectorEq_SSE2;
+  // TODO(https://crbug.com/webp/499): this function produces different results
+  // from the C code due to use of double/float resulting in output differences
+  // when compared to -noasm.
+#if !(defined(__i386__) || defined(_M_IX86))
  VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
+#endif
  VP8LVectorMismatch = VectorMismatch_SSE2;
  VP8LBundleColorMap = BundleColorMap_SSE2;


--- a/3rdparty/libwebp/src/enc/analysis_enc.c
+++ b/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -126,16 +126,6 @@ static void InitHistogram(VP8Histogram* const histo) {
  histo->last_non_zero = 1;
 }

-static void MergeHistograms(const VP8Histogram* const in,
-                            VP8Histogram* const out) {
-  if (in->max_value > out->max_value) {
-    out->max_value = in->max_value;
-  }
-  if (in->last_non_zero > out->last_non_zero) {
-    out->last_non_zero = in->last_non_zero;
-  }
-}
-
 //------------------------------------------------------------------------------
 // Simplified k-Means, to assign Nb segments based on alpha-histogram

@@ -285,49 +275,6 @@ static int FastMBAnalyze(VP8EncIterator* const it) {
  return 0;
 }

-static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
-                                   int best_alpha) {
-  uint8_t modes[16];
-  const int max_mode = MAX_INTRA4_MODE;
-  int i4_alpha;
-  VP8Histogram total_histo;
-  int cur_histo = 0;
-  InitHistogram(&total_histo);
-
-  VP8IteratorStartI4(it);
-  do {
-    int mode;
-    int best_mode_alpha = DEFAULT_ALPHA;
-    VP8Histogram histos[2];
-    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
-
-    VP8MakeIntra4Preds(it);
-    for (mode = 0; mode < max_mode; ++mode) {
-      int alpha;
-
-      InitHistogram(&histos[cur_histo]);
-      VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
-                          0, 1, &histos[cur_histo]);
-      alpha = GetAlpha(&histos[cur_histo]);
-      if (IS_BETTER_ALPHA(alpha, best_mode_alpha)) {
-        best_mode_alpha = alpha;
-        modes[it->i4_] = mode;
-        cur_histo ^= 1;   // keep track of best histo so far.
-      }
-    }
-    // accumulate best histogram
-    MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
-    // Note: we reuse the original samples for predictors
-  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC));
-
-  i4_alpha = GetAlpha(&total_histo);
-  if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
-    VP8SetIntra4Mode(it, modes);
-    best_alpha = i4_alpha;
-  }
-  return best_alpha;
-}
-
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
  int best_alpha = DEFAULT_ALPHA;
  int smallest_alpha = 0;
@@ -371,13 +318,6 @@ static void MBAnalyze(VP8EncIterator* const it,
    best_alpha = FastMBAnalyze(it);
  } else {
    best_alpha = MBAnalyzeBestIntra16Mode(it);
-    if (enc->method_ >= 5) {
-      // We go and make a fast decision for intra4/intra16.
-      // It's usually not a good and definitive pick, but helps seeding the
-      // stats about level bit-cost.
-      // TODO(skal): improve criterion.
-      best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
-    }
  }
  best_uv_alpha = MBAnalyzeBestUVMode(it);


--- a/3rdparty/libwebp/src/enc/backward_references_enc.c
+++ b/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -11,13 +11,14 @@
 //

 #include <assert.h>
+#include <float.h>
 #include <math.h>

-#include "src/enc/backward_references_enc.h"
-#include "src/enc/histogram_enc.h"
+#include "src/dsp/dsp.h"
 #include "src/dsp/lossless.h"
 #include "src/dsp/lossless_common.h"
-#include "src/dsp/dsp.h"
+#include "src/enc/backward_references_enc.h"
+#include "src/enc/histogram_enc.h"
 #include "src/utils/color_cache_utils.h"
 #include "src/utils/utils.h"

@@ -103,6 +104,20 @@ void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
  }
 }

+// Swaps the content of two VP8LBackwardRefs.
+static void BackwardRefsSwap(VP8LBackwardRefs* const refs1,
+                             VP8LBackwardRefs* const refs2) {
+  const int point_to_refs1 =
+      (refs1->tail_ != NULL && refs1->tail_ == &refs1->refs_);
+  const int point_to_refs2 =
+      (refs2->tail_ != NULL && refs2->tail_ == &refs2->refs_);
+  const VP8LBackwardRefs tmp = *refs1;
+  *refs1 = *refs2;
+  *refs2 = tmp;
+  if (point_to_refs2) refs1->tail_ = &refs1->refs_;
+  if (point_to_refs1) refs2->tail_ = &refs2->refs_;
+}
+
 void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
  assert(refs != NULL);
  memset(refs, 0, sizeof(*refs));
@@ -154,6 +169,22 @@ static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
  return b;
 }

+// Return 1 on success, 0 on error.
+static int BackwardRefsClone(const VP8LBackwardRefs* const from,
+                             VP8LBackwardRefs* const to) {
+  const PixOrCopyBlock* block_from = from->refs_;
+  VP8LClearBackwardRefs(to);
+  while (block_from != NULL) {
+    PixOrCopyBlock* const block_to = BackwardRefsNewBlock(to);
+    if (block_to == NULL) return 0;
+    memcpy(block_to->start_, block_from->start_,
+           block_from->size_ * sizeof(PixOrCopy));
+    block_to->size_ = block_from->size_;
+    block_from = block_from->next_;
+  }
+  return 1;
+}
+
 extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
                                      const PixOrCopy v);
 void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
@@ -753,12 +784,18 @@ static int CalculateBestCacheSize(const uint32_t* argb, int quality,
        }
      }
    } else {
+      int code, extra_bits, extra_bits_value;
      // We should compute the contribution of the (distance,length)
      // histograms but those are the same independently from the cache size.
      // As those constant contributions are in the end added to the other
-      // histogram contributions, we can safely ignore them.
+      // histogram contributions, we can ignore them, except for the length
+      // prefix that is part of the literal_ histogram.
      int len = PixOrCopyLength(v);
      uint32_t argb_prev = *argb ^ 0xffffffffu;
+      VP8LPrefixEncode(len, &code, &extra_bits, &extra_bits_value);
+      for (i = 0; i <= cache_bits_max; ++i) {
+        ++histos[i]->literal_[NUM_LITERAL_CODES + code];
+      }
      // Update the color caches.
      do {
        if (*argb != argb_prev) {
@@ -842,16 +879,21 @@ extern int VP8LBackwardReferencesTraceBackwards(
    int xsize, int ysize, const uint32_t* const argb, int cache_bits,
    const VP8LHashChain* const hash_chain,
    const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
-static VP8LBackwardRefs* GetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* best,
-    VP8LBackwardRefs* worst) {
-  const int cache_bits_initial = *cache_bits;
-  double bit_cost_best = -1;
+static int GetBackwardReferences(int width, int height,
+                                 const uint32_t* const argb, int quality,
+                                 int lz77_types_to_try, int cache_bits_max,
+                                 int do_no_cache,
+                                 const VP8LHashChain* const hash_chain,
+                                 VP8LBackwardRefs* const refs,
+                                 int* const cache_bits_best) {
  VP8LHistogram* histo = NULL;
-  int lz77_type, lz77_type_best = 0;
+  int i, lz77_type;
+  // Index 0 is for a color cache, index 1 for no cache (if needed).
+  int lz77_types_best[2] = {0, 0};
+  double bit_costs_best[2] = {DBL_MAX, DBL_MAX};
  VP8LHashChain hash_chain_box;
+  VP8LBackwardRefs* const refs_tmp = &refs[do_no_cache ? 2 : 1];
+  int status = 0;
  memset(&hash_chain_box, 0, sizeof(hash_chain_box));

  histo = VP8LAllocateHistogram(MAX_COLOR_CACHE_BITS);
@@ -860,86 +902,129 @@ static VP8LBackwardRefs* GetBackwardReferences(
  for (lz77_type = 1; lz77_types_to_try;
       lz77_types_to_try &= ~lz77_type, lz77_type <<= 1) {
    int res = 0;
-    double bit_cost;
-    int cache_bits_tmp = cache_bits_initial;
+    double bit_cost = 0.;
    if ((lz77_types_to_try & lz77_type) == 0) continue;
    switch (lz77_type) {
      case kLZ77RLE:
-        res = BackwardReferencesRle(width, height, argb, 0, worst);
+        res = BackwardReferencesRle(width, height, argb, 0, refs_tmp);
        break;
      case kLZ77Standard:
        // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color
        // cache is not that different in practice.
-        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain, worst);
+        res = BackwardReferencesLz77(width, height, argb, 0, hash_chain,
+                                     refs_tmp);
        break;
      case kLZ77Box:
        if (!VP8LHashChainInit(&hash_chain_box, width * height)) goto Error;
        res = BackwardReferencesLz77Box(width, height, argb, 0, hash_chain,
-                                        &hash_chain_box, worst);
+                                        &hash_chain_box, refs_tmp);
        break;
      default:
        assert(0);
    }
    if (!res) goto Error;

-    // Next, try with a color cache and update the references.
-    if (!CalculateBestCacheSize(argb, quality, worst, &cache_bits_tmp)) {
-      goto Error;
-    }
-    if (cache_bits_tmp > 0) {
-      if (!BackwardRefsWithLocalCache(argb, cache_bits_tmp, worst)) {
-        goto Error;
+    // Start with the no color cache case.
+    for (i = 1; i >= 0; --i) {
+      int cache_bits = (i == 1) ? 0 : cache_bits_max;
+
+      if (i == 1 && !do_no_cache) continue;
+
+      if (i == 0) {
+        // Try with a color cache.
+        if (!CalculateBestCacheSize(argb, quality, refs_tmp, &cache_bits)) {
+          goto Error;
+        }
+        if (cache_bits > 0) {
+          if (!BackwardRefsWithLocalCache(argb, cache_bits, refs_tmp)) {
+            goto Error;
+          }
+        }
      }
-    }

-    // Keep the best backward references.
-    VP8LHistogramCreate(histo, worst, cache_bits_tmp);
-    bit_cost = VP8LHistogramEstimateBits(histo);
-    if (lz77_type_best == 0 || bit_cost < bit_cost_best) {
-      VP8LBackwardRefs* const tmp = worst;
-      worst = best;
-      best = tmp;
-      bit_cost_best = bit_cost;
-      *cache_bits = cache_bits_tmp;
-      lz77_type_best = lz77_type;
+      if (i == 0 && do_no_cache && cache_bits == 0) {
+        // No need to re-compute bit_cost as it was computed at i == 1.
+      } else {
+        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+        bit_cost = VP8LHistogramEstimateBits(histo);
+      }
+
+      if (bit_cost < bit_costs_best[i]) {
+        if (i == 1) {
+          // Do not swap as the full cache analysis would have the wrong
+          // VP8LBackwardRefs to start with.
+          if (!BackwardRefsClone(refs_tmp, &refs[1])) goto Error;
+        } else {
+          BackwardRefsSwap(refs_tmp, &refs[0]);
+        }
+        bit_costs_best[i] = bit_cost;
+        lz77_types_best[i] = lz77_type;
+        if (i == 0) *cache_bits_best = cache_bits;
+      }
    }
  }
-  assert(lz77_type_best > 0);
+  assert(lz77_types_best[0] > 0);
+  assert(!do_no_cache || lz77_types_best[1] > 0);

  // Improve on simple LZ77 but only for high quality (TraceBackwards is
  // costly).
-  if ((lz77_type_best == kLZ77Standard || lz77_type_best == kLZ77Box) &&
-      quality >= 25) {
-    const VP8LHashChain* const hash_chain_tmp =
-        (lz77_type_best == kLZ77Standard) ? hash_chain : &hash_chain_box;
-    if (VP8LBackwardReferencesTraceBackwards(width, height, argb, *cache_bits,
-                                             hash_chain_tmp, best, worst)) {
-      double bit_cost_trace;
-      VP8LHistogramCreate(histo, worst, *cache_bits);
-      bit_cost_trace = VP8LHistogramEstimateBits(histo);
-      if (bit_cost_trace < bit_cost_best) best = worst;
+  for (i = 1; i >= 0; --i) {
+    if (i == 1 && !do_no_cache) continue;
+    if ((lz77_types_best[i] == kLZ77Standard ||
+         lz77_types_best[i] == kLZ77Box) &&
+        quality >= 25) {
+      const VP8LHashChain* const hash_chain_tmp =
+          (lz77_types_best[i] == kLZ77Standard) ? hash_chain : &hash_chain_box;
+      const int cache_bits = (i == 1) ? 0 : *cache_bits_best;
+      if (VP8LBackwardReferencesTraceBackwards(width, height, argb, cache_bits,
+                                               hash_chain_tmp, &refs[i],
+                                               refs_tmp)) {
+        double bit_cost_trace;
+        VP8LHistogramCreate(histo, refs_tmp, cache_bits);
+        bit_cost_trace = VP8LHistogramEstimateBits(histo);
+        if (bit_cost_trace < bit_costs_best[i]) {
+          BackwardRefsSwap(refs_tmp, &refs[i]);
+        }
+      }
    }
-  }

-  BackwardReferences2DLocality(width, best);
+    BackwardReferences2DLocality(width, &refs[i]);
+
+    if (i == 1 && lz77_types_best[0] == lz77_types_best[1] &&
+        *cache_bits_best == 0) {
+      // If the best cache size is 0 and we have the same best LZ77, just copy
+      // the data over and stop here.
+      if (!BackwardRefsClone(&refs[1], &refs[0])) goto Error;
+      break;
+    }
+  }
+  status = 1;

 Error:
  VP8LHashChainClear(&hash_chain_box);
  VP8LFreeHistogram(histo);
-  return best;
+  return status;
 }

-VP8LBackwardRefs* VP8LGetBackwardReferences(
+WebPEncodingError VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2) {
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best) {
  if (low_effort) {
-    return GetBackwardReferencesLowEffort(width, height, argb, cache_bits,
-                                          hash_chain, refs_tmp1);
+    VP8LBackwardRefs* refs_best;
+    *cache_bits_best = cache_bits_max;
+    refs_best = GetBackwardReferencesLowEffort(
+        width, height, argb, cache_bits_best, hash_chain, refs);
+    if (refs_best == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    // Set it in first position.
+    BackwardRefsSwap(refs_best, &refs[0]);
  } else {
-    return GetBackwardReferences(width, height, argb, quality,
-                                 lz77_types_to_try, cache_bits, hash_chain,
-                                 refs_tmp1, refs_tmp2);
+    if (!GetBackwardReferences(width, height, argb, quality, lz77_types_to_try,
+                               cache_bits_max, do_no_cache, hash_chain, refs,
+                               cache_bits_best)) {
+      return VP8_ENC_ERROR_OUT_OF_MEMORY;
+    }
  }
+  return VP8_ENC_OK;
 }
--- a/3rdparty/libwebp/src/enc/backward_references_enc.h
+++ b/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -16,6 +16,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "src/webp/types.h"
+#include "src/webp/encode.h"
 #include "src/webp/format_constants.h"

 #ifdef __cplusplus
@@ -218,14 +219,19 @@ enum VP8LLZ77Type {
 // Evaluates best possible backward references for specified quality.
 // The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
 // bits to use (passing 0 implies disabling the local color cache).
-// The optimal cache bits is evaluated and set for the *cache_bits parameter.
-// The return value is the pointer to the best of the two backward refs viz,
-// refs[0] or refs[1].
-VP8LBackwardRefs* VP8LGetBackwardReferences(
+// The optimal cache bits is evaluated and set for the *cache_bits_best
+// parameter with the matching refs_best.
+// If do_no_cache == 0, refs is an array of 2 values and the best
+// VP8LBackwardRefs is put in the first element.
+// If do_no_cache != 0, refs is an array of 3 values and the best
+// VP8LBackwardRefs is put in the first element, the best value with no-cache in
+// the second element.
+// In both cases, the last element is used as temporary internally.
+WebPEncodingError VP8LGetBackwardReferences(
    int width, int height, const uint32_t* const argb, int quality,
-    int low_effort, int lz77_types_to_try, int* const cache_bits,
-    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs_tmp1,
-    VP8LBackwardRefs* const refs_tmp2);
+    int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
+    const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
+    int* const cache_bits_best);

 #ifdef __cplusplus
 }

--- a/3rdparty/libwebp/src/enc/config_enc.c
+++ b/3rdparty/libwebp/src/enc/config_enc.c
@@ -39,6 +39,8 @@ int WebPConfigInitInternal(WebPConfig* config,
  config->partitions = 0;
  config->segments = 4;
  config->pass = 1;
+  config->qmin = 0;
+  config->qmax = 100;
  config->show_compressed = 0;
  config->preprocessing = 0;
  config->autofilter = 0;
@@ -106,6 +108,9 @@ int WebPValidateConfig(const WebPConfig* config) {
  if (config->filter_type < 0 || config->filter_type > 1) return 0;
  if (config->autofilter < 0 || config->autofilter > 1) return 0;
  if (config->pass < 1 || config->pass > 10) return 0;
+  if (config->qmin < 0 || config->qmax > 100 || config->qmin > config->qmax) {
+    return 0;
+  }
  if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
  if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
  if (config->partitions < 0 || config->partitions > 3) return 0;

--- a/3rdparty/libwebp/src/enc/frame_enc.c
+++ b/3rdparty/libwebp/src/enc/frame_enc.c
@@ -31,10 +31,15 @@
 // we allow 2k of extra head-room in PARTITION0 limit.
 #define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)

+static float Clamp(float v, float min, float max) {
+  return (v < min) ? min : (v > max) ? max : v;
+}
+
 typedef struct {  // struct for organizing convergence in either size or PSNR
  int is_first;
  float dq;
  float q, last_q;
+  float qmin, qmax;
  double value, last_value;   // PSNR or size
  double target;
  int do_size_search;
@@ -47,7 +52,9 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {

  s->is_first = 1;
  s->dq = 10.f;
-  s->q = s->last_q = enc->config_->quality;
+  s->qmin = 1.f * enc->config_->qmin;
+  s->qmax = 1.f * enc->config_->qmax;
+  s->q = s->last_q = Clamp(enc->config_->quality, s->qmin, s->qmax);
  s->target = do_size_search ? (double)target_size
            : (target_PSNR > 0.) ? target_PSNR
            : 40.;   // default, just in case
@@ -56,10 +63,6 @@ static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
  return do_size_search;
 }

-static float Clamp(float v, float min, float max) {
-  return (v < min) ? min : (v > max) ? max : v;
-}
-
 static float ComputeNextQ(PassStats* const s) {
  float dq;
  if (s->is_first) {
@@ -75,7 +78,7 @@ static float ComputeNextQ(PassStats* const s) {
  s->dq = Clamp(dq, -30.f, 30.f);
  s->last_q = s->q;
  s->last_value = s->value;
-  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
+  s->q = Clamp(s->q + s->dq, s->qmin, s->qmax);
  return s->q;
 }

@@ -848,9 +851,10 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
    }

 #if (DEBUG_SEARCH > 0)
-    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
+    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf "
+           " range:[%.1f, %.1f]\n",
           num_pass_left, stats.last_value, stats.value,
-           stats.last_q, stats.q, stats.dq);
+           stats.last_q, stats.q, stats.dq, stats.qmin, stats.qmax);
 #endif
    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
      ++num_pass_left;

--- a/3rdparty/libwebp/src/enc/histogram_enc.c
+++ b/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -208,6 +208,7 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
  } else if (PixOrCopyIsCacheIdx(v)) {
    const int literal_ix =
        NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    assert(histo->palette_code_bits_ != 0);
    ++histo->literal_[literal_ix];
  } else {
    int code, extra_bits;

--- a/3rdparty/libwebp/src/enc/picture_csp_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -61,16 +61,14 @@ static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
 // Checking for the presence of non-opaque alpha.
 int WebPPictureHasTransparency(const WebPPicture* picture) {
  if (picture == NULL) return 0;
-  if (!picture->use_argb) {
-    return CheckNonOpaque(picture->a, picture->width, picture->height,
-                          1, picture->a_stride);
-  } else {
+  if (picture->use_argb) {
    const int alpha_offset = ALPHA_OFFSET;
    return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
                          picture->width, picture->height,
                          4, picture->argb_stride * sizeof(*picture->argb));
  }
-  return 0;
+  return CheckNonOpaque(picture->a, picture->width, picture->height,
+                        1, picture->a_stride);
 }

 //------------------------------------------------------------------------------
@@ -90,8 +88,9 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 static int kLinearToGammaTab[kGammaTabSize + 1];
 static uint16_t kGammaToLinearTab[256];
 static volatile int kGammaTablesOk = 0;
+static void InitGammaTables(void);

-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {
+WEBP_DSP_INIT_FUNC(InitGammaTables) {
  if (!kGammaTablesOk) {
    int v;
    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
@@ -181,8 +180,9 @@ static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
 #define GAMMA_TO_LINEAR_BITS 14
 static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
 static volatile int kGammaTablesSOk = 0;
+static void InitGammaTablesS(void);

-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
+WEBP_DSP_INIT_FUNC(InitGammaTablesS) {
  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
  if (!kGammaTablesSOk) {
    int v;

--- a/3rdparty/libwebp/src/enc/picture_tools_enc.c
+++ b/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -83,6 +83,19 @@ static int SmoothenBlock(const uint8_t* a_ptr, int a_stride, uint8_t* y_ptr,
  return (count == 0);
 }

+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color) {
+  if (pic != NULL && pic->use_argb) {
+    int y = pic->height;
+    uint32_t* argb = pic->argb;
+    color &= 0xffffffu;   // force alpha=0
+    WebPInitAlphaProcessing();
+    while (y-- > 0) {
+      WebPAlphaReplace(argb, pic->width, color);
+      argb += pic->argb_stride;
+    }
+  }
+}
+
 void WebPCleanupTransparentArea(WebPPicture* pic) {
  int x, y, w, h;
  if (pic == NULL) return;
@@ -165,24 +178,6 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2

-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
-  int x, y, w, h;
-  uint32_t* argb;
-  assert(pic != NULL && pic->use_argb);
-  w = pic->width;
-  h = pic->height;
-  argb = pic->argb;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      if ((argb[x] & 0xff000000) == 0) {
-        argb[x] = 0x00000000;
-      }
-    }
-    argb += pic->argb_stride;
-  }
-}
-
 //------------------------------------------------------------------------------
 // Blend color and remove transparency info


--- a/3rdparty/libwebp/src/enc/vp8i_enc.h
+++ b/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -31,7 +31,7 @@ extern "C" {

 // version numbers
 #define ENC_MAJ_VERSION 1
-#define ENC_MIN_VERSION 1
+#define ENC_MIN_VERSION 2
 #define ENC_REV_VERSION 0

 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
@@ -505,9 +505,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
 // Returns false in case of error (invalid param, out-of-memory).
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);

-// Clean-up the RGB samples under fully transparent area, to help lossless
-// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
-void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+// Replace samples that are fully transparent by 'color' to help compressibility
+// (no guarantee, though). Assumes pic->use_argb is true.
+void WebPReplaceTransparentPixels(WebPPicture* const pic, uint32_t color);

 //------------------------------------------------------------------------------


--- a/3rdparty/libwebp/src/enc/vp8l_enc.c
+++ b/3rdparty/libwebp/src/enc/vp8l_enc.c
--- a/3rdparty/libwebp/src/enc/vp8li_enc.h
+++ b/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -71,7 +71,7 @@ typedef struct {
  uint32_t palette_[MAX_PALETTE_SIZE];

  // Some 'scratch' (potentially large) objects.
-  struct VP8LBackwardRefs refs_[3];  // Backward Refs array for temporaries.
+  struct VP8LBackwardRefs refs_[4];  // Backward Refs array for temporaries.
  VP8LHashChain hash_chain_;         // HashChain data for constructing
                                     // backward references.
 } VP8LEncoder;

--- a/3rdparty/libwebp/src/enc/webp_enc.c
+++ b/3rdparty/libwebp/src/enc/webp_enc.c
@@ -400,7 +400,7 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
    }

    if (!config->exact) {
-      WebPCleanupTransparentAreaLossless(pic);
+      WebPReplaceTransparentPixels(pic, 0x000000);
    }

    ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.

--- a/3rdparty/libwebp/src/mux/muxi.h
+++ b/3rdparty/libwebp/src/mux/muxi.h
@@ -28,7 +28,7 @@ extern "C" {
 // Defines and constants.

 #define MUX_MAJ_VERSION 1
-#define MUX_MIN_VERSION 1
+#define MUX_MIN_VERSION 2
 #define MUX_REV_VERSION 0

 // Chunk object.

--- a/3rdparty/libwebp/src/mux/muxread.c
+++ b/3rdparty/libwebp/src/mux/muxread.c
@@ -155,7 +155,6 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
        break;
      default:
        goto Fail;
-        break;
    }
    subchunk_size = ChunkDiskSize(&subchunk);
    bytes += subchunk_size;
@@ -264,7 +263,6 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
        if (!MuxImageParse(&chunk, copy_data, wpi)) goto Err;
        ChunkRelease(&chunk);
        goto PushImage;
-        break;
      default:  // A non-image chunk.
        if (wpi->is_partial_) goto Err;  // Encountered a non-image chunk before
                                         // getting all chunks of an image.

--- a/3rdparty/libwebp/src/utils/utils.c
+++ b/3rdparty/libwebp/src/utils/utils.c
@@ -231,7 +231,7 @@ void WebPFree(void* ptr) {
 void WebPCopyPlane(const uint8_t* src, int src_stride,
                   uint8_t* dst, int dst_stride, int width, int height) {
  assert(src != NULL && dst != NULL);
-  assert(src_stride >= width && dst_stride >= width);
+  assert(abs(src_stride) >= width && abs(dst_stride) >= width);
  while (height-- > 0) {
    memcpy(dst, src, width);
    src += src_stride;

--- a/3rdparty/libwebp/src/webp/decode.h
+++ b/3rdparty/libwebp/src/webp/decode.h
@@ -453,7 +453,7 @@ struct WebPDecoderOptions {
  int scaled_width, scaled_height;    // final resolution
  int use_threads;                    // if true, use multi-threaded decoding
  int dithering_strength;             // dithering strength (0=Off, 100=full)
-  int flip;                           // flip output vertically
+  int flip;                           // if true, flip output vertically
  int alpha_dithering_strength;       // alpha dithering strength in [0..100]

  uint32_t pad[5];                    // padding for later use

--- a/3rdparty/libwebp/src/webp/encode.h
+++ b/3rdparty/libwebp/src/webp/encode.h
@@ -148,7 +148,8 @@ struct WebPConfig {
  int use_delta_palette;  // reserved for future lossless feature
  int use_sharp_yuv;      // if needed, use sharp (and slow) RGB->YUV conversion

-  uint32_t pad[2];        // padding for later use
+  int qmin;               // minimum permissible quality factor
+  int qmax;               // maximum permissible quality factor
 };

 // Enumerate some predefined settings for WebPConfig, depending on the type
@@ -291,6 +292,11 @@ typedef enum WebPEncodingError {
 #define WEBP_MAX_DIMENSION 16383

 // Main exchange structure (input samples, output bytes, statistics)
+//
+// Once WebPPictureInit() has been called, it's ok to make all the INPUT fields
+// (use_argb, y/u/v, argb, ...) point to user-owned data, even if
+// WebPPictureAlloc() has been called. Depending on the value use_argb,
+// it's guaranteed that either *argb or *y/*u/*v content will be kept untouched.
 struct WebPPicture {
  //   INPUT
  //////////////