dbms: using in-tree re2 library [#METR-17973].

7d3d51f8 · Alexey Milovidov · 21d37dba · 7d3d51f8 · 7d3d51f8 · 7d3d51f8
64 changed files
--- a/contrib/libre2/AUTHORS
+++ b/contrib/libre2/AUTHORS
+# This is the official list of RE2 authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Google Inc.
+Samsung Electronics
+Stefano Rivera <stefano.rivera@gmail.com>
--- a/contrib/libre2/CONTRIBUTORS
+++ b/contrib/libre2/CONTRIBUTORS
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the RE2 repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Dominic Battré <battre@chromium.org>
+Dmitriy Vyukov <dvyukov@google.com>
+John Millikin <jmillikin@gmail.com>
+Mike Nazarewicz <mpn@google.com>
+Pawel Hajdan <phajdan.jr@gmail.com>
+Rob Pike <r@google.com>
+Russ Cox <rsc@swtch.com>
+Sanjay Ghemawat <sanjay@google.com>
+Stefano Rivera <stefano.rivera@gmail.com>
+Srinivasan Venkatachary <vsri@google.com>
+Viatcheslav Ostapenko <sl.ostapenko@samsung.com>
--- a/contrib/libre2/LICENSE
+++ b/contrib/libre2/LICENSE
+// Copyright (c) 2009 The RE2 Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/contrib/libre2/README
+++ b/contrib/libre2/README
+This is the source code repository for RE2, a regular expression library.
+
+For documentation about how to install and use RE2,
+visit http://code.google.com/p/re2/.
+
+The short version is:
+
+make
+make test
+make install
+make testinstall
+
+Unless otherwise noted, the RE2 source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+RE2's native language is C++.
+An Inferno wrapper is at http://code.google.com/p/inferno-re2/.
+A Python wrapper is at http://github.com/facebook/pyre2/.
+A Ruby wrapper is at http://github.com/axic/rre2/.
--- a/contrib/libre2/re2/bitstate.cc
+++ b/contrib/libre2/re2/bitstate.cc
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tested by search_test.cc, exhaustive_test.cc, tester.cc
+
+// Prog::SearchBitState is a regular expression search with submatch
+// tracking for small regular expressions and texts.  Like
+// testing/backtrack.cc, it allocates a bit vector with (length of
+// text) * (length of prog) bits, to make sure it never explores the
+// same (character position, instruction) state multiple times.  This
+// limits the search to run in time linear in the length of the text.
+//
+// Unlike testing/backtrack.cc, SearchBitState is not recursive
+// on the text.
+//
+// SearchBitState is a fast replacement for the NFA code on small
+// regexps and texts when SearchOnePass cannot be used.
+
+#include "re2/prog.h"
+#include "re2/regexp.h"
+
+namespace re2 {
+
+struct Job {
+  int id;  // instruction id, as passed to prog_->inst()
+  int arg;  // 0 for a fresh visit; > 0 resumes an earlier visit of id
+  const char* p;  // text position; for a kInstCapture resume, the saved register value
+};
+
+class BitState {
+ public:
+  explicit BitState(Prog* prog);
+  ~BitState();
+
+  // The usual Search prototype: returns whether a match was found.
+  // Can only call Search once per BitState (it allocates the scratch arrays).
+  bool Search(const StringPiece& text, const StringPiece& context,
+              bool anchored, bool longest,
+              StringPiece* submatch, int nsubmatch);
+
+ private:
+  inline bool ShouldVisit(int id, const char* p);  // marks (id, p) visited; false if it already was
+  void Push(int id, const char* p, int arg);  // pushes a job, growing the stack if needed
+  bool GrowStack();  // doubles the capacity of job_
+  bool TrySearch(int id, const char* p);  // runs the search from one start position
+
+  // Search parameters
+  Prog* prog_;              // program being run
+  StringPiece text_;        // text being searched
+  StringPiece context_;     // greater context of text being searched
+  bool anchored_;           // whether search is anchored at text.begin()
+  bool longest_;            // whether search wants leftmost-longest match
+  bool endmatch_;           // whether match must end at text.end()
+  StringPiece *submatch_;   // submatches to fill in
+  int nsubmatch_;           //   # of submatches to fill in
+
+  // Search state
+  const char** cap_;        // capture registers
+  int ncap_;  //   # of capture registers (2 per submatch, at least 2)
+
+  static const int VisitedBits = 32;  // bits per word of visited_
+  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
+  int nvisited_;            //   # of words in bitmap
+
+  Job *job_;                // stack of text positions to explore
+  int njob_;  //   # of jobs currently on the stack
+  int maxjob_;  //   # of jobs the allocated stack can hold
+};
+
+// Builds an idle searcher for prog.  Nothing is allocated here: the
+// scratch pointers start out NULL and the counters at zero.
+BitState::BitState(Prog* prog)
+    : prog_(prog),
+      anchored_(false), longest_(false), endmatch_(false),
+      submatch_(NULL), nsubmatch_(0),
+      cap_(NULL), ncap_(0),
+      visited_(NULL), nvisited_(0),
+      job_(NULL), njob_(0), maxjob_(0) {}
+
+// Frees the three scratch arrays (capture registers, job stack, bitmap).
+BitState::~BitState() {
+  delete[] cap_;
+  delete[] job_;
+  delete[] visited_;
+}
+
+// Should the search visit the pair (id, p)?
+// If so, remember that it was visited so that the next time,
+// we don't repeat the visit.
+//
+// Each (instruction id, text offset) pair owns one bit of visited_,
+// numbered id * (text_.size() + 1) + (p - text_.begin()).
+bool BitState::ShouldVisit(int id, const char* p) {
+  uint n = id * (text_.size() + 1) + (p - text_.begin());
+  // Build the mask from an unsigned 1: shifting a signed 1 into the
+  // top bit (n % 32 == 31) is not portable.
+  const uint32 mask = static_cast<uint32>(1) << (n & (VisitedBits-1));
+  uint32& word = visited_[n/VisitedBits];
+  if (word & mask)
+    return false;
+  word |= mask;
+  return true;
+}
+
+// Doubles the capacity of the job stack, keeping the jobs already on it.
+// Returns false (after logging) if the stack is still full afterwards.
+bool BitState::GrowStack() {
+  // VLOG(0) << "Reallocate.";
+  maxjob_ *= 2;
+  Job* bigger = new Job[maxjob_];
+  memmove(bigger, job_, njob_*sizeof job_[0]);
+  delete[] job_;
+  job_ = bigger;
+  if (njob_ < maxjob_)
+    return true;
+  LOG(DFATAL) << "Job stack overflow.";
+  return false;
+}
+
+// Pushes the job (id, p, arg), enlarging the stack first when it is full.
+// Nothing is pushed if the stack cannot grow, if id names a kInstFail
+// instruction, or if this is a fresh visit (arg == 0) to a pair that
+// ShouldVisit reports as already seen.
+void BitState::Push(int id, const char* p, int arg) {
+  if (njob_ >= maxjob_ && !GrowStack())
+    return;
+  if (prog_->inst(id)->opcode() == kInstFail)
+    return;
+
+  // arg > 0 means an earlier visit is being resumed, so the visited
+  // bitmap is consulted (and updated) only for arg == 0.
+  if (arg == 0 && !ShouldVisit(id, p))
+    return;
+
+  Job& top = job_[njob_++];
+  top.id = id;
+  top.p = p;
+  top.arg = arg;
+}
+
+// Try a search from instruction id0 in state p0.
+// Return whether it succeeded.
+//
+// Runs a backtracking search with an explicit stack of (id, p, arg) jobs.
+// When a match is found and nsubmatch_ > 0, the capture registers are
+// copied into submatch_; with longest_ set, the search then carries on
+// looking for a longer match.
+bool BitState::TrySearch(int id0, const char* p0) {
+  bool matched = false;
+  const char* end = text_.end();
+  njob_ = 0;
+  Push(id0, p0, 0);
+  while (njob_ > 0) {
+    // Pop job off stack.
+    --njob_;
+    int id = job_[njob_].id;
+    const char* p = job_[njob_].p;
+    int arg = job_[njob_].arg;
+
+    // Optimization: rather than push and pop,
+    // code that is going to Push and continue
+    // the loop simply updates id, p, and arg
+    // and jumps to CheckAndLoop.  We have to
+    // do the ShouldVisit check that Push
+    // would have, but we avoid the stack
+    // manipulation.
+    if (0) {
+    CheckAndLoop:
+      if (!ShouldVisit(id, p))
+        continue;
+    }
+
+    // Visit id, p.
+    // VLOG(0) << "Job: " << ip->id() << " "
+    //         << (p - text_.begin()) << " " << arg;
+    Prog::Inst* ip = prog_->inst(id);
+    switch (ip->opcode()) {
+      case kInstFail:
+      default:
+        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
+        return false;
+
+      case kInstAlt:
+        // Cannot just
+        //   Push(ip->out1(), p, 0);
+        //   Push(ip->out(), p, 0);
+        // If, during the processing of ip->out(), we encounter
+        // ip->out1() via another path, we want to process it then.
+        // Pushing it here will inhibit that.  Instead, re-push
+        // ip with arg==1 as a reminder to push ip->out1() later.
+        switch (arg) {
+          case 0:
+            Push(id, p, 1);  // come back when we're done
+            id = ip->out();
+            goto CheckAndLoop;
+
+          case 1:
+            // Finished ip->out(); try ip->out1().
+            arg = 0;
+            id = ip->out1();
+            goto CheckAndLoop;
+        }
+        // Name the opcode we are actually handling.
+        LOG(DFATAL) << "Bad arg in kInstAlt: " << arg;
+        continue;
+
+      case kInstAltMatch:
+        // One opcode is byte range; the other leads to match.
+        if (ip->greedy(prog_)) {
+          // out1 is the match
+          Push(ip->out1(), p, 0);
+          id = ip->out1();
+          p = end;
+          goto CheckAndLoop;
+        }
+        // out is the match - non-greedy
+        Push(ip->out(), end, 0);
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstByteRange: {
+        // c stays -1 at the end of the text, where there is no byte to read.
+        int c = -1;
+        if (p < end)
+          c = *p & 0xFF;
+        if (ip->Matches(c)) {
+          id = ip->out();
+          p++;
+          goto CheckAndLoop;
+        }
+        continue;
+      }
+
+      case kInstCapture:
+        switch (arg) {
+          case 0:
+            if (0 <= ip->cap() && ip->cap() < ncap_) {
+              // Capture p to register, but save old value.
+              Push(id, cap_[ip->cap()], 1);  // come back when we're done
+              cap_[ip->cap()] = p;
+            }
+            // Continue on.
+            id = ip->out();
+            goto CheckAndLoop;
+          case 1:
+            // Finished ip->out(); restore the old value.
+            // (For this job, p carries the saved register value.)
+            cap_[ip->cap()] = p;
+            continue;
+        }
+        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
+        continue;
+
+      case kInstEmptyWidth:
+        // Give up on this path if any required empty-width flag is absent at p.
+        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
+          continue;
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstNop:
+        id = ip->out();
+        goto CheckAndLoop;
+
+      case kInstMatch: {
+        if (endmatch_ && p != text_.end())
+          continue;
+
+        // VLOG(0) << "Found match.";
+        // We found a match.  If the caller doesn't care
+        // where the match is, no point going further.
+        if (nsubmatch_ == 0)
+          return true;
+
+        // Record best match so far.
+        // Only need to check end point, because this entire
+        // call is only considering one start position.
+        matched = true;
+        cap_[1] = p;
+        if (submatch_[0].data() == NULL ||
+            (longest_ && p > submatch_[0].end())) {
+          for (int i = 0; i < nsubmatch_; i++)
+            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
+        }
+
+        // If going for first match, we're done.
+        if (!longest_)
+          return true;
+
+        // If we used the entire text, no longer match is possible.
+        if (p == text_.end())
+          return true;
+
+        // Otherwise, continue on in hope of a longer match.
+        continue;
+      }
+    }
+  }
+  return matched;
+}
+
+// Search text (within context) for prog_.
+// Returns whether a match was found; submatch[0..nsubmatch) is reset
+// before the search starts.  Every call allocates fresh scratch arrays
+// without releasing earlier ones.
+bool BitState::Search(const StringPiece& text, const StringPiece& context,
+                      bool anchored, bool longest,
+                      StringPiece* submatch, int nsubmatch) {
+  // Remember what we were asked to do.
+  text_ = text;
+  context_ = context;
+  if (context_.begin() == NULL)
+    context_ = text;
+
+  // A program anchored at either end cannot match unless text sits at
+  // the corresponding edge of context.
+  if ((prog_->anchor_start() && context_.begin() != text.begin()) ||
+      (prog_->anchor_end() && context_.end() != text.end()))
+    return false;
+
+  anchored_ = anchored || prog_->anchor_start();
+  longest_ = longest || prog_->anchor_end();
+  endmatch_ = prog_->anchor_end();
+  submatch_ = submatch;
+  nsubmatch_ = nsubmatch;
+  for (int k = 0; k < nsubmatch_; k++)
+    submatch_[k] = NULL;
+
+  // Scratch space: one visited bit per (instruction, text offset) pair,
+  // rounded up to whole words and cleared.
+  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
+  visited_ = new uint32[nvisited_];
+  memset(visited_, 0, nvisited_*sizeof visited_[0]);
+  // VLOG(0) << "nvisited_ = " << nvisited_;
+
+  // Two capture registers per submatch, and never fewer than two.
+  ncap_ = (nsubmatch < 1) ? 2 : 2*nsubmatch;
+  cap_ = new const char*[ncap_];
+  memset(cap_, 0, ncap_*sizeof cap_[0]);
+
+  maxjob_ = 256;
+  job_ = new Job[maxjob_];
+
+  // An anchored search gets exactly one attempt, at text.begin().
+  const char* const first = text.begin();
+  if (anchored_) {
+    cap_[0] = first;
+    return TrySearch(prog_->start(), first);
+  }
+
+  // Unanchored: try every start position, including text.end() itself so
+  // that an empty match at the end of the text is found (hence <=).
+  // visited_ is not cleared between attempts, so no (instruction,
+  // position) pair is explored twice and the total work stays linear.
+  const char* const last = text.end();
+  for (const char* start = first; start <= last; start++) {
+    cap_[0] = start;
+    if (TrySearch(prog_->start(), start))  // Leftmost match wins; stop here.
+      return true;
+  }
+  return false;
+}
+
+// Bit-state search.
+// Runs a BitState over text and returns whether it matched.  For
+// kFullMatch the match must additionally end at text.end().
+bool Prog::SearchBitState(const StringPiece& text,
+                          const StringPiece& context,
+                          Anchor anchor,
+                          MatchKind kind,
+                          StringPiece* match,
+                          int nmatch) {
+  // A full match is implemented as an anchored longest match whose
+  // match[0] must reach the end of text, so match[0] has to exist:
+  // substitute a local one when the caller supplied none.
+  StringPiece whole;
+  const bool full = (kind == kFullMatch);
+  if (full) {
+    anchor = kAnchored;
+    if (nmatch < 1) {
+      match = &whole;
+      nmatch = 1;
+    }
+  }
+
+  // Run the search.
+  BitState searcher(this);
+  const bool found = searcher.Search(text, context,
+                                     anchor == kAnchored,
+                                     kind != kFirstMatch,
+                                     match, nmatch);
+  if (!found)
+    return false;
+  return !(full && match[0].end() != text.end());
+}
+
+}  // namespace re2
--- a/contrib/libre2/re2/compile.cc
+++ b/contrib/libre2/re2/compile.cc
--- a/contrib/libre2/re2/dfa.cc
+++ b/contrib/libre2/re2/dfa.cc
--- a/contrib/libre2/re2/filtered_re2.cc
+++ b/contrib/libre2/re2/filtered_re2.cc
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string>
+#include "util/util.h"
+#include "re2/filtered_re2.h"
+#include "re2/prefilter.h"
+#include "re2/prefilter_tree.h"
+
+namespace re2 {
+
+// Starts out uncompiled, with a freshly allocated prefilter tree.
+FilteredRE2::FilteredRE2()
+    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
+
+// Deletes every RE2 held in re2_vec_, then the prefilter tree.
+FilteredRE2::~FilteredRE2() {
+  for (vector<RE2*>::iterator it = re2_vec_.begin();
+       it != re2_vec_.end(); ++it) {
+    delete *it;
+  }
+  delete prefilter_tree_;
+}
+
+// Compiles pattern with options.  On success the new RE2 is appended to
+// re2_vec_ and its index is stored in *id.  On failure the RE2 is deleted,
+// *id is left untouched, and (if options.log_errors()) the offending
+// pattern is logged.  Returns the RE2's error code either way.
+RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
+                                const RE2::Options& options, int* id) {
+  RE2* re = new RE2(pattern, options);
+  RE2::ErrorCode code = re->error_code();
+
+  if (!re->ok()) {
+    if (options.log_errors()) {
+      // Log the pattern text: streaming `re` itself would only print the
+      // address of the heap object.
+      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
+                 << pattern << " due to error " << re->error();
+    }
+    delete re;
+  } else {
+    *id = re2_vec_.size();
+    re2_vec_.push_back(re);
+  }
+
+  return code;
+}
+
+// Adds a prefilter for every stored regexp to the tree, then clears atoms
+// and passes it to the tree's Compile.  Logs and does nothing when already
+// compiled or when no regexp has been added.
+void FilteredRE2::Compile(vector<string>* atoms) {
+  const bool nothing_to_do = compiled_ || re2_vec_.size() == 0;
+  if (nothing_to_do) {
+    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
+    return;
+  }
+
+  for (int k = 0; k < re2_vec_.size(); k++)
+    prefilter_tree_->Add(Prefilter::FromRE2(re2_vec_[k]));
+
+  atoms->clear();
+  prefilter_tree_->Compile(atoms);
+  compiled_ = true;
+}
+
+// Tries every stored regexp in index order, without any prefiltering.
+// Returns the index of the first one that partially matches text, or -1.
+int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
+  int found = -1;
+  for (int k = 0; found < 0 && k < re2_vec_.size(); k++) {
+    if (RE2::PartialMatch(text, *re2_vec_[k]))
+      found = k;
+  }
+  return found;
+}
+
+// Asks the prefilter tree which regexps to try for the given atoms, then
+// returns the index of the first of them that partially matches text.
+// Returns -1 if none match, or (after logging) if Compile has not run.
+int FilteredRE2::FirstMatch(const StringPiece& text,
+                            const vector<int>& atoms) const {
+  if (!compiled_) {
+    LOG(DFATAL) << "FirstMatch called before Compile";
+    return -1;
+  }
+
+  vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (int k = 0; k < candidates.size(); k++) {
+    const int regexp_id = candidates[k];
+    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
+      return regexp_id;
+  }
+  return -1;
+}
+
+// Clears matching_regexps, then appends the index of every regexp that
+// the prefilter tree returns for atoms and that partially matches text.
+// Returns true if at least one index was appended.
+bool FilteredRE2::AllMatches(
+    const StringPiece& text,
+    const vector<int>& atoms,
+    vector<int>* matching_regexps) const {
+  matching_regexps->clear();
+
+  vector<int> candidates;
+  prefilter_tree_->RegexpsGivenStrings(atoms, &candidates);
+  for (int k = 0; k < candidates.size(); k++) {
+    const int regexp_id = candidates[k];
+    if (RE2::PartialMatch(text, *re2_vec_[regexp_id]))
+      matching_regexps->push_back(regexp_id);
+  }
+
+  const bool any_matched = !matching_regexps->empty();
+  return any_matched;
+}
+
+// Forwards to the prefilter tree's RegexpsGivenStrings.
+void FilteredRE2::RegexpsGivenStrings(
+    const vector<int>& matched_atoms,
+    vector<int>* passed_regexps) {
+  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
+}
+
+
+// Forwards to the prefilter tree's PrintPrefilter for regexpid.
+void FilteredRE2::PrintPrefilter(int regexpid)
+{
+  prefilter_tree_->PrintPrefilter(regexpid);
+}
+
+}  // namespace re2
--- a/contrib/libre2/re2/filtered_re2.h
+++ b/contrib/libre2/re2/filtered_re2.h
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
+// It provides a prefilter mechanism that helps in cutting down the
+// number of regexps that need to be actually searched.
+//
+// By design, it does not include a string matching engine. This is to
+// allow the user of the class to use their favorite string match
+// engine. The overall flow is: Add all the regexps using Add, then
+// Compile the FilteredRE2. The compile returns strings that need to
+// be matched. Note that all returned strings are lowercase. For
+// applying regexps to a search text, the caller does the string
+// matching using the strings returned. When doing the string match,
+// note that the caller has to do that on lower cased version of the
+// search text. Then call FirstMatch or AllMatches with a vector of
+// indices of strings that were found in the text to get the actual
+// regexp matches.
+
+#ifndef RE2_FILTERED_RE2_H_
+#define RE2_FILTERED_RE2_H_
+
+#include <vector>
+#include "re2/re2.h"
+
+namespace re2 {
+using std::vector;
+
+class PrefilterTree;
+
+class FilteredRE2 {
+ public:
+  FilteredRE2();
+  ~FilteredRE2();
+
+  // Uses RE2 constructor to create a RE2 object (re). Returns
+  // re->error_code(). If re is not ok, it is deleted and not added to
+  // re2_vec_; otherwise *id receives its index in re2_vec_.
+  RE2::ErrorCode Add(const StringPiece& pattern,
+                     const RE2::Options& options,
+                     int *id);
+
+  // Prepares the regexps added by Add for filtering.  Returns a set
+  // of strings that the caller should check for in candidate texts.
+  // The returned strings are lowercased. When doing string matching,
+  // the search text should be lowercased first to find matching
+  // strings from the set of strings returned by Compile.  Call after
+  // all Add calls are done.
+  void Compile(vector<string>* strings_to_match);
+
+  // Returns the index of the first matching regexp.
+  // Returns -1 on no match. Can be called prior to Compile.
+  // Does not do any filtering: simply tries to Match the
+  // regexps in a loop.
+  int SlowFirstMatch(const StringPiece& text) const;
+
+  // Returns the index of the first matching regexp.
+  // Returns -1 on no match. Compile has to be called before
+  // calling this.
+  int FirstMatch(const StringPiece& text,
+                 const vector<int>& atoms) const;
+
+  // Stores the indices of all matching regexps in matching_regexps, after
+  // first clearing it. Returns true if at least one regexp matched.
+  bool AllMatches(const StringPiece& text,
+                  const vector<int>& atoms,
+                  vector<int>* matching_regexps) const;
+
+  // The number of regexps added.
+  int NumRegexps() const { return re2_vec_.size(); }
+
+ private:
+
+  // Get the individual RE2 objects. Useful for testing.
+  RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
+
+  // Print prefilter.
+  void PrintPrefilter(int regexpid);
+
+  // Useful for testing and debugging.
+  void RegexpsGivenStrings(const vector<int>& matched_atoms,
+                           vector<int>* passed_regexps);
+
+  // All the regexps in the FilteredRE2.
+  vector<RE2*> re2_vec_;
+
+  // Has the FilteredRE2 been compiled using Compile()
+  bool compiled_;
+
+  // An AND-OR tree of string atoms used for filtering regexps.
+  PrefilterTree* prefilter_tree_;
+
+  // Not copyable: the copy constructor and assignment operator are private.
+  FilteredRE2(const FilteredRE2&);
+  void operator=(const FilteredRE2&);
+};
+
+}  // namespace re2
+
+#endif  // RE2_FILTERED_RE2_H_
--- a/contrib/libre2/re2/mimics_pcre.cc
+++ b/contrib/libre2/re2/mimics_pcre.cc
+// Copyright 2008 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Determine whether this library should match PCRE exactly
+// for a particular Regexp.  (If so, the testing framework can
+// check that it does.)
+//
+// This library matches PCRE except in these cases:
+//   * the regexp contains a repetition of an empty string,
+//     like (a*)* or (a*)+.  In this case, PCRE will treat
+//     the repetition sequence as ending with an empty string,
+//     while this library does not.
+//   * Perl and PCRE differ on whether \v matches \n.
+//     For historical reasons, this library implements the Perl behavior.
+//   * Perl and PCRE allow $ in one-line mode to match either the very
+//     end of the text or just before a \n at the end of the text.
+//     This library requires it to match only the end of the text.
+//   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
+//     match the end of the text if the last character is a \n.
+//     This library does allow it.
+//
+// Regexp::MimicsPCRE checks for any of these conditions.
+
+#include "util/util.h"
+#include "re2/regexp.h"
+#include "re2/walker-inl.h"
+
+namespace re2 {
+
+// Returns whether re might match an empty string (defined later in this file).
+static bool CanBeEmptyString(Regexp *re);
+
+// Walker class to compute whether library handles a regexp
+// exactly as PCRE would.  See comment at top for conditions.
+
+class PCREWalker : public Regexp::Walker<bool> {
+ public:
+  PCREWalker() {}
+
+  // Combines the verdicts of re's children (child_args) with the checks
+  // for re itself; returns false if the library may differ from PCRE.
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
+                 int nchild_args);
+
+  // Not expected to run; logs and returns a unchanged.
+  bool ShortVisit(Regexp* re, bool a) {
+    // Should never be called: we use Walk not WalkExponential.
+    // Name this class in the message so the log points at the right walker.
+    LOG(DFATAL) << "PCREWalker::ShortVisit called";
+    return a;
+  }
+};
+
+// Runs once re's children have been visited; child_args holds, for each
+// child, whether this library mimics PCRE on that subexpression.
+// Returns whether it also does so for re as a whole.
+bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                           bool* child_args, int nchild_args) {
+  // One failing child is enough to fail the parent.
+  for (int k = 0; k < nchild_args; k++) {
+    if (!child_args[k])
+      return false;
+  }
+
+  // The children are fine; now check re itself.
+  bool mimics = true;
+  switch (re->op()) {
+    // Repetition of something that may be empty.
+    case kRegexpStar:
+    case kRegexpPlus:
+    case kRegexpQuest:
+      mimics = !CanBeEmptyString(re->sub()[0]);
+      break;
+    case kRegexpRepeat:
+      mimics = !(re->max() == -1 && CanBeEmptyString(re->sub()[0]));
+      break;
+
+    // A literal \v.
+    case kRegexpLiteral:
+      mimics = (re->rune() != '\v');
+      break;
+
+    // $ in single-line mode.
+    case kRegexpEndText:
+    case kRegexpEmptyMatch:
+      mimics = !(re->parse_flags() & Regexp::WasDollar);
+      break;
+
+    // ^ in multi-line mode.  Unconditional, because in single-line
+    // mode ^ becomes kRegexpBeginText instead.
+    case kRegexpBeginLine:
+      mimics = false;
+      break;
+
+    default:
+      break;
+  }
+
+  return mimics;
+}
+
+// Walks this regexp with a PCREWalker (initial argument true) and returns
+// the walker's verdict on whether its behavior mimics PCRE's exactly.
+bool Regexp::MimicsPCRE() {
+  PCREWalker walker;
+  const bool mimics = walker.Walk(this, true);
+  return mimics;
+}
+
+
+// Walker class to compute whether a Regexp can match an empty string.
+// It is okay to overestimate.  For example, \b\B cannot match an empty
+// string, because \b and \B are mutually exclusive, but this isn't
+// that smart and will say it can.  Spurious empty strings
+// will reduce the number of regexps we sanity check against PCRE,
+// but they won't break anything.
+
+class EmptyStringWalker : public Regexp::Walker<bool> {
+ public:
+  EmptyStringWalker() {}
+
+  // Computes the answer for re from its children's answers (child_args).
+  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                 bool* child_args, int nchild_args);
+
+  // Walk (not WalkExponential) is what we use, so this is not expected
+  // to run; if it does, complain and hand back the incoming value.
+  bool ShortVisit(Regexp* re, bool a)
+  {
+    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
+    return a;
+  }
+
+ private:
+  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
+};
+
+// Runs once re's children have been visited.  child_args[i] says whether
+// child i can match an empty string; the return value says the same
+// about re itself.  Ops not listed below yield false.
+bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
+                                  bool* child_args, int nchild_args) {
+  switch (re->op()) {
+    // Never empty.
+    case kRegexpNoMatch:
+    case kRegexpLiteral:
+    case kRegexpAnyChar:
+    case kRegexpAnyByte:
+    case kRegexpCharClass:
+    case kRegexpLiteralString:
+      return false;
+
+    // Always empty (the assertions are empty whenever they match), or
+    // always able to be empty (star, quest).
+    case kRegexpEmptyMatch:
+    case kRegexpBeginLine:
+    case kRegexpEndLine:
+    case kRegexpNoWordBoundary:
+    case kRegexpWordBoundary:
+    case kRegexpBeginText:
+    case kRegexpEndText:
+    case kRegexpStar:
+    case kRegexpQuest:
+    case kRegexpHaveMatch:
+      return true;
+
+    // A concatenation is empty only if every piece can be.
+    case kRegexpConcat: {
+      bool all_empty = true;
+      for (int k = 0; all_empty && k < nchild_args; k++)
+        all_empty = child_args[k];
+      return all_empty;
+    }
+
+    // An alternation is empty if some branch can be.
+    case kRegexpAlternate: {
+      bool any_empty = false;
+      for (int k = 0; !any_empty && k < nchild_args; k++)
+        any_empty = child_args[k];
+      return any_empty;
+    }
+
+    // Empty exactly when the single child can be.
+    case kRegexpPlus:
+    case kRegexpCapture:
+      return child_args[0];
+
+    // Empty if the child can be, or if zero repetitions are allowed.
+    case kRegexpRepeat:
+      return child_args[0] || re->min() == 0;
+  }
+  return false;
+}
+
+// Walks re with an EmptyStringWalker (initial argument true) and returns
+// its verdict on whether re can match an empty string.
+static bool CanBeEmptyString(Regexp* re) {
+  EmptyStringWalker walker;
+  const bool can_be_empty = walker.Walk(re, true);
+  return can_be_empty;
+}
+
+}  // namespace re2
--- a/contrib/libre2/re2/nfa.cc
+++ b/contrib/libre2/re2/nfa.cc
--- a/contrib/libre2/re2/onepass.cc
+++ b/contrib/libre2/re2/onepass.cc
--- a/contrib/libre2/re2/parse.cc
+++ b/contrib/libre2/re2/parse.cc
--- a/contrib/libre2/re2/perl_groups.cc
+++ b/contrib/libre2/re2/perl_groups.cc
+// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
+// make_perl_groups.pl >perl_groups.cc
+
+#include "re2/unicode_groups.h"
+
+namespace re2 {
+
+static const URange16 code1[] = {  /* \d: '0'-'9' */
+	{ 0x30, 0x39 },
+};
+static const URange16 code2[] = {  /* \s: tab, newline, form feed, carriage return, space */
+	{ 0x9, 0xa },
+	{ 0xc, 0xd },
+	{ 0x20, 0x20 },
+};
+static const URange16 code3[] = {  /* \w: '0'-'9', 'A'-'Z', '_', 'a'-'z' */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x5f, 0x5f },
+	{ 0x61, 0x7a },
+};
+const UGroup perl_groups[] = {  /* per entry: name, +1/-1, range table, # of ranges in that table; -1 presumably marks the negated class -- confirm against UGroup */
+	{ "\\d", +1, code1, 1 },
+	{ "\\D", -1, code1, 1 },
+	{ "\\s", +1, code2, 3 },
+	{ "\\S", -1, code2, 3 },
+	{ "\\w", +1, code3, 4 },
+	{ "\\W", -1, code3, 4 },
+};
+const int num_perl_groups = 6;  /* number of entries in perl_groups */
+static const URange16 code4[] = {  /* [:alnum:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x61, 0x7a },
+};
+static const URange16 code5[] = {  /* [:alpha:] */
+	{ 0x41, 0x5a },
+	{ 0x61, 0x7a },
+};
+static const URange16 code6[] = {  /* [:ascii:] */
+	{ 0x0, 0x7f },
+};
+static const URange16 code7[] = {  /* [:blank:] */
+	{ 0x9, 0x9 },
+	{ 0x20, 0x20 },
+};
+static const URange16 code8[] = {  /* [:cntrl:] */
+	{ 0x0, 0x1f },
+	{ 0x7f, 0x7f },
+};
+static const URange16 code9[] = {  /* [:digit:] */
+	{ 0x30, 0x39 },
+};
+static const URange16 code10[] = {  /* [:graph:] */
+	{ 0x21, 0x7e },
+};
+static const URange16 code11[] = {  /* [:lower:] */
+	{ 0x61, 0x7a },
+};
+static const URange16 code12[] = {  /* [:print:] */
+	{ 0x20, 0x7e },
+};
+static const URange16 code13[] = {  /* [:punct:] */
+	{ 0x21, 0x2f },
+	{ 0x3a, 0x40 },
+	{ 0x5b, 0x60 },
+	{ 0x7b, 0x7e },
+};
+static const URange16 code14[] = {  /* [:space:] */
+	{ 0x9, 0xd },
+	{ 0x20, 0x20 },
+};
+static const URange16 code15[] = {  /* [:upper:] */
+	{ 0x41, 0x5a },
+};
+static const URange16 code16[] = {  /* [:word:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x5a },
+	{ 0x5f, 0x5f },
+	{ 0x61, 0x7a },
+};
+static const URange16 code17[] = {  /* [:xdigit:] */
+	{ 0x30, 0x39 },
+	{ 0x41, 0x46 },
+	{ 0x61, 0x66 },
+};
+const UGroup posix_groups[] = {  /* POSIX bracket classes; each entry lists, in order: class name, a +1/-1 sign, a range table, and the number of ranges in that table */
+	{ "[:alnum:]", +1, code4, 3 },  /* +1 entry: the plain class name */
+	{ "[:^alnum:]", -1, code4, 3 },  /* -1 entry: the "^" spelling, reusing the same table; NOTE(review): presumably interpreted as the complement - confirm in the parser */
+	{ "[:alpha:]", +1, code5, 2 },
+	{ "[:^alpha:]", -1, code5, 2 },
+	{ "[:ascii:]", +1, code6, 1 },
+	{ "[:^ascii:]", -1, code6, 1 },
+	{ "[:blank:]", +1, code7, 2 },
+	{ "[:^blank:]", -1, code7, 2 },
+	{ "[:cntrl:]", +1, code8, 2 },
+	{ "[:^cntrl:]", -1, code8, 2 },
+	{ "[:digit:]", +1, code9, 1 },
+	{ "[:^digit:]", -1, code9, 1 },
+	{ "[:graph:]", +1, code10, 1 },
+	{ "[:^graph:]", -1, code10, 1 },
+	{ "[:lower:]", +1, code11, 1 },
+	{ "[:^lower:]", -1, code11, 1 },
+	{ "[:print:]", +1, code12, 1 },
+	{ "[:^print:]", -1, code12, 1 },
+	{ "[:punct:]", +1, code13, 4 },
+	{ "[:^punct:]", -1, code13, 4 },
+	{ "[:space:]", +1, code14, 2 },
+	{ "[:^space:]", -1, code14, 2 },
+	{ "[:upper:]", +1, code15, 1 },
+	{ "[:^upper:]", -1, code15, 1 },
+	{ "[:word:]", +1, code16, 4 },
+	{ "[:^word:]", -1, code16, 4 },
+	{ "[:xdigit:]", +1, code17, 3 },
+	{ "[:^xdigit:]", -1, code17, 3 },
+};
+const int num_posix_groups = 28;  /* entry count of posix_groups: 14 classes, each with a plain and a "^" spelling; keep in sync by hand if the table changes */
+
+}  // namespace re2
--- a/contrib/libre2/re2/prefilter.cc
+++ b/contrib/libre2/re2/prefilter.cc
--- a/contrib/libre2/re2/prefilter.h
+++ b/contrib/libre2/re2/prefilter.h
--- a/contrib/libre2/re2/prefilter_tree.cc
+++ b/contrib/libre2/re2/prefilter_tree.cc
--- a/contrib/libre2/re2/prefilter_tree.h
+++ b/contrib/libre2/re2/prefilter_tree.h
--- a/contrib/libre2/re2/prog.cc
+++ b/contrib/libre2/re2/prog.cc
--- a/contrib/libre2/re2/prog.h
+++ b/contrib/libre2/re2/prog.h
--- a/contrib/libre2/re2/re2.cc
+++ b/contrib/libre2/re2/re2.cc
--- a/contrib/libre2/re2/re2.h
+++ b/contrib/libre2/re2/re2.h
--- a/contrib/libre2/re2/regexp.cc
+++ b/contrib/libre2/re2/regexp.cc
--- a/contrib/libre2/re2/regexp.h
+++ b/contrib/libre2/re2/regexp.h
--- a/contrib/libre2/re2/set.cc
+++ b/contrib/libre2/re2/set.cc
--- a/contrib/libre2/re2/set.h
+++ b/contrib/libre2/re2/set.h
--- a/contrib/libre2/re2/simplify.cc
+++ b/contrib/libre2/re2/simplify.cc
--- a/contrib/libre2/re2/stringpiece.h
+++ b/contrib/libre2/re2/stringpiece.h
--- a/contrib/libre2/re2/tostring.cc
+++ b/contrib/libre2/re2/tostring.cc
--- a/contrib/libre2/re2/unicode.py
+++ b/contrib/libre2/re2/unicode.py
--- a/contrib/libre2/re2/unicode_casefold.cc
+++ b/contrib/libre2/re2/unicode_casefold.cc
--- a/contrib/libre2/re2/unicode_casefold.h
+++ b/contrib/libre2/re2/unicode_casefold.h
--- a/contrib/libre2/re2/unicode_groups.cc
+++ b/contrib/libre2/re2/unicode_groups.cc
--- a/contrib/libre2/re2/unicode_groups.h
+++ b/contrib/libre2/re2/unicode_groups.h
--- a/contrib/libre2/re2/variadic_function.h
+++ b/contrib/libre2/re2/variadic_function.h
--- a/contrib/libre2/re2/walker-inl.h
+++ b/contrib/libre2/re2/walker-inl.h
--- a/contrib/libre2/util/arena.cc
+++ b/contrib/libre2/util/arena.cc
--- a/contrib/libre2/util/arena.h
+++ b/contrib/libre2/util/arena.h
--- a/contrib/libre2/util/atomicops.h
+++ b/contrib/libre2/util/atomicops.h
--- a/contrib/libre2/util/benchmark.cc
+++ b/contrib/libre2/util/benchmark.cc
--- a/contrib/libre2/util/benchmark.h
+++ b/contrib/libre2/util/benchmark.h
+// Copyright 2009 The RE2 Authors.  All Rights Reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef RE2_UTIL_BENCHMARK_H__
+#define RE2_UTIL_BENCHMARK_H__
+
+namespace testing {
+struct Benchmark {  // Describes one benchmark function; both constructors end by calling Register().
+  const char* name;  // Label given to the constructor (stored as a pointer, not copied).
+  void (*fn)(int);  // Set by the two-argument constructor; 0 otherwise.
+  void (*fnr)(int, int);  // Set by the four-argument constructor; 0 otherwise.
+  int lo;  // lo/hi: bounds given to the four-argument constructor; 0 otherwise.
+  int hi;
+  int threadlo;  // threadlo/threadhi: 0 until ThreadRange() is called.
+  int threadhi;
+  
+  void Register();  // Declared only; no definition appears in this header.
+  Benchmark(const char* name, void (*f)(int)) { Clear(name); fn = f; Register(); }  // Resets all fields, stores f in fn, then registers.
+  Benchmark(const char* name, void (*f)(int, int), int l, int h) { Clear(name); fnr = f; lo = l; hi = h; Register(); }  // Resets all fields, stores f in fnr and l/h in lo/hi, then registers.
+  void Clear(const char* n) { name = n; fn = 0; fnr = 0; lo = 0; hi = 0; threadlo = 0; threadhi = 0; }  // Sets name to n and zeroes every other field.
+  Benchmark* ThreadRange(int lo, int hi) { threadlo = lo; threadhi = hi; return this; }  // Parameters shadow the lo/hi members, so only threadlo/threadhi change; returns this so calls can be chained.
+};
+}  // namespace testing
+
+void SetBenchmarkBytesProcessed(long long);  // The functions below are declarations only; their definitions are not in this header. NOTE(review): presumably implemented in util/benchmark.cc - confirm.
+void StopBenchmarkTiming();
+void StartBenchmarkTiming();
+void BenchmarkMemoryUsage();
+void SetBenchmarkItemsProcessed(int);
+
+int NumCPUs();  // Declaration only; NOTE(review): the name suggests it reports the processor count - confirm in the implementation.
+
+#define BENCHMARK(f) \
+	::testing::Benchmark* _benchmark_##f = (new ::testing::Benchmark(#f, f))  /* Declares a pointer variable _benchmark_<f> initialised with a new Benchmark whose name is the stringized f; the constructor calls Register(). No trailing semicolon is supplied. */
+
+#define BENCHMARK_RANGE(f, lo, hi) \
+	::testing::Benchmark* _benchmark_##f = \
+	(new ::testing::Benchmark(#f, f, lo, hi))  /* Same as BENCHMARK but uses the four-argument constructor, so f must have the (int, int) signature and lo/hi are stored in the Benchmark. */
+
+#endif  // RE2_UTIL_BENCHMARK_H__
--- a/contrib/libre2/util/flags.h
+++ b/contrib/libre2/util/flags.h
--- a/contrib/libre2/util/hash.cc
+++ b/contrib/libre2/util/hash.cc
--- a/contrib/libre2/util/logging.h
+++ b/contrib/libre2/util/logging.h
--- a/contrib/libre2/util/mutex.h
+++ b/contrib/libre2/util/mutex.h
--- a/contrib/libre2/util/pcre.cc
+++ b/contrib/libre2/util/pcre.cc
--- a/contrib/libre2/util/pcre.h
+++ b/contrib/libre2/util/pcre.h
--- a/contrib/libre2/util/random.cc
+++ b/contrib/libre2/util/random.cc
--- a/contrib/libre2/util/random.h
+++ b/contrib/libre2/util/random.h
--- a/contrib/libre2/util/rune.cc
+++ b/contrib/libre2/util/rune.cc
--- a/contrib/libre2/util/sparse_array.h
+++ b/contrib/libre2/util/sparse_array.h
--- a/contrib/libre2/util/sparse_array_test.cc
+++ b/contrib/libre2/util/sparse_array_test.cc
--- a/contrib/libre2/util/sparse_set.h
+++ b/contrib/libre2/util/sparse_set.h
--- a/contrib/libre2/util/stringpiece.cc
+++ b/contrib/libre2/util/stringpiece.cc
--- a/contrib/libre2/util/stringprintf.cc
+++ b/contrib/libre2/util/stringprintf.cc
--- a/contrib/libre2/util/strutil.cc
+++ b/contrib/libre2/util/strutil.cc
--- a/contrib/libre2/util/test.cc
+++ b/contrib/libre2/util/test.cc
--- a/contrib/libre2/util/test.h
+++ b/contrib/libre2/util/test.h
--- a/contrib/libre2/util/thread.cc
+++ b/contrib/libre2/util/thread.cc
--- a/contrib/libre2/util/thread.h
+++ b/contrib/libre2/util/thread.h
--- a/contrib/libre2/util/utf.h
+++ b/contrib/libre2/util/utf.h
--- a/contrib/libre2/util/util.h
+++ b/contrib/libre2/util/util.h
--- a/contrib/libre2/util/valgrind.cc
+++ b/contrib/libre2/util/valgrind.cc
--- a/contrib/libre2/util/valgrind.h
+++ b/contrib/libre2/util/valgrind.h