ripgrep: remove old code

Parent f16f9ced
@@ -134,12 +134,12 @@ dependencies = [
 [[package]]
 name = "grep"
-version = "0.1.9"
+version = "0.2.0"
 dependencies = [
- "log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
- "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "grep-matcher 0.0.1",
+ "grep-printer 0.0.1",
+ "grep-regex 0.0.1",
+ "grep-searcher 0.0.1",
 ]

 [[package]]
@@ -192,16 +192,6 @@ dependencies = [
  "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

-[[package]]
-name = "grep2"
-version = "0.2.0"
-dependencies = [
- "grep-matcher 0.0.1",
- "grep-printer 0.0.1",
- "grep-regex 0.0.1",
- "grep-searcher 0.0.1",
-]
-
 [[package]]
 name = "ignore"
 version = "0.4.3"
@@ -339,19 +329,12 @@ name = "ripgrep"
 version = "0.9.0"
 dependencies = [
  "atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
- "bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
- "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
- "encoding_rs_io 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "globset 0.4.1",
- "grep 0.1.9",
- "grep2 0.2.0",
+ "grep 0.2.0",
  "ignore 0.4.3",
  "lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
  "log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
- "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -35,23 +35,15 @@ path = "tests/tests.rs"
 members = [
   "grep", "globset", "ignore",
   "grep-matcher", "grep-printer", "grep-regex", "grep-searcher",
-  "grep2",
 ]

 [dependencies]
 atty = "0.2.11"
-bytecount = "0.3.1"
-encoding_rs = "0.8"
-encoding_rs_io = "0.1"
 globset = { version = "0.4.0", path = "globset" }
-grep = { version = "0.1.8", path = "grep" }
-grep2 = { version = "0.2.0", path = "grep2" }
+grep = { version = "0.2.0", path = "grep" }
 ignore = { version = "0.4.0", path = "ignore" }
 lazy_static = "1"
 libc = "0.2"
 log = "0.4"
-memchr = "2"
-memmap = "0.6"
 num_cpus = "1"
 regex = "1"
 same-file = "1"
@@ -75,15 +67,8 @@ default-features = false
 features = ["suggestions", "color"]

 [features]
-avx-accel = [
-  "bytecount/avx-accel",
-  "grep2/avx-accel",
-]
-simd-accel = [
-  "bytecount/simd-accel",
-  "encoding_rs/simd-accel",
-  "grep2/simd-accel",
-]
+avx-accel = ["grep/avx-accel"]
+simd-accel = ["grep/simd-accel"]

 [profile.release]
 debug = true
 [package]
 name = "grep"
-version = "0.1.9" #:version
+version = "0.2.0" #:version
 authors = ["Andrew Gallant <jamslam@gmail.com>"]
 description = """
 Fast line oriented regex searching as a library.
@@ -13,7 +13,11 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
 license = "Unlicense/MIT"

 [dependencies]
-log = "0.4"
-memchr = "2"
-regex = "1"
-regex-syntax = "0.6"
+grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
+grep-printer = { version = "0.0.1", path = "../grep-printer" }
+grep-regex = { version = "0.0.1", path = "../grep-regex" }
+grep-searcher = { version = "0.0.1", path = "../grep-searcher" }
+
+[features]
+avx-accel = ["grep-searcher/avx-accel"]
+simd-accel = ["grep-searcher/simd-accel"]
This diff is collapsed.
#![deny(missing_docs)]

/*!
A fast line oriented regex searcher.

TODO.
*/

#[macro_use]
extern crate log;
extern crate memchr;
extern crate regex;
extern crate regex_syntax as syntax;

use std::error;
use std::fmt;
use std::result;

pub use search::{Grep, GrepBuilder, Iter, Match};

mod literals;
mod nonl;
mod search;
mod smart_case;
mod word_boundary;

/// Result is a convenient type alias that fixes the type of the error to
/// the `Error` type defined in this crate.
pub type Result<T> = result::Result<T, Error>;

/// Error enumerates the list of possible error conditions when building or
/// using a `Grep` line searcher.
#[derive(Debug)]
pub enum Error {
    /// An error from parsing or compiling a regex.
    Regex(regex::Error),
    /// This error occurs when an illegal literal was found in the regex
    /// pattern. For example, if the line terminator is `\n` and the regex
    /// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error.
    LiteralNotAllowed(char),
    /// An unused enum variant that indicates this enum may be expanded in
    /// the future and therefore should not be exhaustively matched.
    #[doc(hidden)]
    __Nonexhaustive,
}

impl error::Error for Error {
    fn description(&self) -> &str {
        match *self {
            Error::Regex(ref err) => err.description(),
            Error::LiteralNotAllowed(_) => "use of forbidden literal",
            Error::__Nonexhaustive => unreachable!(),
        }
    }

    fn cause(&self) -> Option<&error::Error> {
        match *self {
            Error::Regex(ref err) => err.cause(),
            _ => None,
        }
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            Error::Regex(ref err) => err.fmt(f),
            Error::LiteralNotAllowed(chr) => {
                write!(f, "Literal {:?} not allowed.", chr)
            }
            Error::__Nonexhaustive => unreachable!(),
        }
    }
}

impl From<regex::Error> for Error {
    fn from(err: regex::Error) -> Error {
        Error::Regex(err)
    }
}

impl From<syntax::Error> for Error {
    fn from(err: syntax::Error) -> Error {
        Error::Regex(regex::Error::Syntax(err.to_string()))
    }
}

#![deny(missing_docs)]

pub extern crate grep_matcher as matcher;
pub extern crate grep_printer as printer;
pub extern crate grep_regex as regex;
pub extern crate grep_searcher as searcher;
/*!
The literals module is responsible for extracting *inner* literals out of the
AST of a regular expression. Normally this is the job of the regex engine
itself, but the regex engine doesn't look for inner literals. Since we're doing
line based searching, we can use them, so we need to do it ourselves.
Note that this implementation is incredibly suspicious. We need something more
principled.
*/
use std::cmp;

use regex::bytes::RegexBuilder;
use syntax::hir::{self, Hir, HirKind};
use syntax::hir::literal::{Literal, Literals};

#[derive(Clone, Debug)]
pub struct LiteralSets {
    prefixes: Literals,
    suffixes: Literals,
    required: Literals,
}

impl LiteralSets {
    pub fn create(expr: &Hir) -> Self {
        let mut required = Literals::empty();
        union_required(expr, &mut required);
        LiteralSets {
            prefixes: Literals::prefixes(expr),
            suffixes: Literals::suffixes(expr),
            required: required,
        }
    }

    pub fn to_regex_builder(&self) -> Option<RegexBuilder> {
        if self.prefixes.all_complete() && !self.prefixes.is_empty() {
            debug!("literal prefixes detected: {:?}", self.prefixes);
            // When this is true, the regex engine will do a literal scan.
            return None;
        }

        // Out of inner required literals, prefixes and suffixes, which one
        // is the longest? We pick the longest to do fast literal scan under
        // the assumption that a longer literal will have a lower false
        // positive rate.
        let pre_lcp = self.prefixes.longest_common_prefix();
        let pre_lcs = self.prefixes.longest_common_suffix();
        let suf_lcp = self.suffixes.longest_common_prefix();
        let suf_lcs = self.suffixes.longest_common_suffix();

        let req_lits = self.required.literals();
        let req = match req_lits.iter().max_by_key(|lit| lit.len()) {
            None => &[],
            Some(req) => &***req,
        };

        let mut lit = pre_lcp;
        if pre_lcs.len() > lit.len() {
            lit = pre_lcs;
        }
        if suf_lcp.len() > lit.len() {
            lit = suf_lcp;
        }
        if suf_lcs.len() > lit.len() {
            lit = suf_lcs;
        }
        if req_lits.len() == 1 && req.len() > lit.len() {
            lit = req;
        }

        // Special case: if we have any literals that are all whitespace,
        // then this is probably a failing of the literal detection since
        // whitespace is typically pretty common. In this case, don't bother
        // with inner literal scanning at all and just defer to the regex.
        let any_all_white = req_lits.iter()
            .any(|lit| lit.iter().all(|&b| (b as char).is_whitespace()));
        if any_all_white {
            return None;
        }

        // Special case: if we detected an alternation of inner required
        // literals and its longest literal is bigger than the longest
        // prefix/suffix, then choose the alternation. In practice, this
        // helps with case insensitive matching, which can generate lots of
        // inner required literals.
        let any_empty = req_lits.iter().any(|lit| lit.is_empty());
        if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
            debug!("required literals found: {:?}", req_lits);
            let alts: Vec<String> =
                req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
            let mut builder = RegexBuilder::new(&alts.join("|"));
            builder.unicode(false);
            Some(builder)
        } else if lit.is_empty() {
            None
        } else {
            debug!("required literal found: {:?}", show(lit));
            let mut builder = RegexBuilder::new(&bytes_to_regex(&lit));
            builder.unicode(false);
            Some(builder)
        }
    }
}
fn union_required(expr: &Hir, lits: &mut Literals) {
    match *expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            let mut buf = [0u8; 4];
            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            lits.cross_add(&[b]);
        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) {
                lits.cut();
            }
        }
        HirKind::Class(hir::Class::Bytes(ref cls)) => {
            if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) {
                lits.cut();
            }
        }
        HirKind::Group(hir::Group { ref hir, .. }) => {
            union_required(&**hir, lits);
        }
        HirKind::Repetition(ref x) => {
            match x.kind {
                hir::RepetitionKind::ZeroOrOne => lits.cut(),
                hir::RepetitionKind::ZeroOrMore => lits.cut(),
                hir::RepetitionKind::OneOrMore => {
                    union_required(&x.hir, lits);
                    lits.cut();
                }
                hir::RepetitionKind::Range(ref rng) => {
                    let (min, max) = match *rng {
                        hir::RepetitionRange::Exactly(m) => (m, Some(m)),
                        hir::RepetitionRange::AtLeast(m) => (m, None),
                        hir::RepetitionRange::Bounded(m, n) => (m, Some(n)),
                    };
                    repeat_range_literals(
                        &x.hir, min, max, x.greedy, lits, union_required);
                }
            }
        }
        HirKind::Concat(ref es) if es.is_empty() => {}
        HirKind::Concat(ref es) if es.len() == 1 => {
            union_required(&es[0], lits)
        }
        HirKind::Concat(ref es) => {
            for e in es {
                let mut lits2 = lits.to_empty();
                union_required(e, &mut lits2);
                if lits2.is_empty() {
                    lits.cut();
                    continue;
                }
                if lits2.contains_empty() {
                    lits.cut();
                }
                if !lits.cross_product(&lits2) {
                    // If this expression couldn't yield any literal that
                    // could be extended, then we need to quit. Since we're
                    // short-circuiting, we also need to freeze every member.
                    lits.cut();
                    break;
                }
            }
        }
        HirKind::Alternation(ref es) => {
            alternate_literals(es, lits, union_required);
        }
        _ => lits.cut(),
    }
}
fn repeat_range_literals<F: FnMut(&Hir, &mut Literals)>(
    e: &Hir,
    min: u32,
    max: Option<u32>,
    _greedy: bool,
    lits: &mut Literals,
    mut f: F,
) {
    if min == 0 {
        // This is a bit conservative. If `max` is set, then we could
        // treat this as a finite set of alternations. For now, we
        // just treat it as `e*`.
        lits.cut();
    } else {
        let n = cmp::min(lits.limit_size(), min as usize);
        // We only extract literals from a single repetition, even though
        // we could do more. e.g., `a{3}` will have `a` extracted instead of
        // `aaa`. The reason is that inner literal extraction can't be unioned
        // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}`
        // is wrong.
        f(e, lits);
        if n < min as usize {
            lits.cut();
        }
        if max.map_or(true, |max| min < max) {
            lits.cut();
        }
    }
}
fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
    es: &[Hir],
    lits: &mut Literals,
    mut f: F,
) {
    let mut lits2 = lits.to_empty();
    for e in es {
        let mut lits3 = lits.to_empty();
        lits3.set_limit_size(lits.limit_size() / 5);
        f(e, &mut lits3);
        if lits3.is_empty() || !lits2.union(lits3) {
            // If we couldn't find suffixes for *any* of the
            // alternates, then the entire alternation has to be thrown
            // away and any existing members must be frozen. Similarly,
            // if the union couldn't complete, stop and freeze.
            lits.cut();
            return;
        }
    }
    // All we do at the moment is look for prefixes and suffixes. If both
    // are empty, then we report nothing. We should be able to do better than
    // this, but we'll need something more expressive than just a "set of
    // literals."
    let lcp = lits2.longest_common_prefix();
    let lcs = lits2.longest_common_suffix();
    if !lcp.is_empty() {
        lits.cross_add(lcp);
    }
    lits.cut();
    if !lcs.is_empty() {
        lits.add(Literal::empty());
        lits.add(Literal::new(lcs.to_vec()));
    }
}
/// Return the number of characters in the given class.
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}

/// Return the number of bytes in the given class.
fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
    cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
}

/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
fn bytes_to_regex(bs: &[u8]) -> String {
    let mut s = String::with_capacity(bs.len());
    for &b in bs {
        s.push_str(&format!("\\x{:02x}", b));
    }
    s
}
/// Converts arbitrary bytes to a nice string.
fn show(bs: &[u8]) -> String {
    // Why aren't we using this to feed to the regex? Doesn't really matter
    // I guess. ---AG
    use std::ascii::escape_default;
    use std::str;

    let mut nice = String::new();
    for &b in bs {
        let part: Vec<u8> = escape_default(b).collect();
        nice.push_str(str::from_utf8(&part).unwrap());
    }
    nice
}
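To make the module's intent concrete, here is a hedged illustration (the pattern and haystack are invented, and `LiteralSets` itself is crate-internal, so this sketches the idea rather than the module's API): for `\w+foo\d+` the regex engine has no usable prefix, but the required inner literal `foo` can be scanned for cheaply, and the `bytes_to_regex` escaping above lets that literal be spliced into a scanning regex even when it is not valid UTF-8.

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // Inner-literal scan (a stand-in for the regex this module builds),
    // followed by confirmation with the full pattern on each candidate.
    let full = Regex::new(r"\w+foo\d+").unwrap();
    let inner = Regex::new(r"\x66\x6f\x6f").unwrap(); // bytes_to_regex(b"foo")
    let hay = &b"barfoo123\nquux\n"[..];
    for m in inner.find_iter(hay) {
        // The real searcher locates the line around `m` first; here we
        // simply confirm against the whole haystack.
        if full.is_match(hay) {
            println!("candidate at byte {} confirmed", m.start());
        }
    }
}
```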
use syntax::hir::{self, Hir, HirKind};

use {Error, Result};

/// Returns a new expression that is guaranteed to never match the given
/// ASCII character.
///
/// If the expression contains the literal byte, then an error is returned.
///
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
/// function panics.
pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
    assert!(byte <= 0x7F);
    let chr = byte as char;
    assert!(chr.len_utf8() == 1);
    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            if c == chr {
                return Err(Error::LiteralNotAllowed(chr));
            }
            Hir::literal(hir::Literal::Unicode(c))
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            if b as char == chr {
                return Err(Error::LiteralNotAllowed(chr));
            }
            Hir::literal(hir::Literal::Byte(b))
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            let remove = hir::ClassUnicode::new(Some(
                hir::ClassUnicodeRange::new(chr, chr),
            ));
            cls.difference(&remove);
            if cls.iter().next().is_none() {
                return Err(Error::LiteralNotAllowed(chr));
            }
            Hir::class(hir::Class::Unicode(cls))
        }
        HirKind::Class(hir::Class::Bytes(mut cls)) => {
            let remove = hir::ClassBytes::new(Some(
                hir::ClassBytesRange::new(byte, byte),
            ));
            cls.difference(&remove);
            if cls.iter().next().is_none() {
                return Err(Error::LiteralNotAllowed(chr));
            }
            Hir::class(hir::Class::Bytes(cls))
        }
        HirKind::Anchor(x) => Hir::anchor(x),
        HirKind::WordBoundary(x) => Hir::word_boundary(x),
        HirKind::Repetition(mut x) => {
            x.hir = Box::new(remove(*x.hir, byte)?);
            Hir::repetition(x)
        }
        HirKind::Group(mut x) => {
            x.hir = Box::new(remove(*x.hir, byte)?);
            Hir::group(x)
        }
        HirKind::Concat(xs) => {
            let xs = xs.into_iter()
                .map(|e| remove(e, byte))
                .collect::<Result<Vec<Hir>>>()?;
            Hir::concat(xs)
        }
        HirKind::Alternation(xs) => {
            let xs = xs.into_iter()
                .map(|e| remove(e, byte))
                .collect::<Result<Vec<Hir>>>()?;
            Hir::alternation(xs)
        }
    })
}
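A short sketch of how `remove` behaves, assuming it is invoked from inside the crate (the HIR values come from regex-syntax 0.6, as in the imports above; the patterns are invented):

```rust
extern crate regex_syntax;

use regex_syntax::ParserBuilder;

fn main() {
    // `a|b` can never match `\n`, so it passes through unchanged.
    let hir = ParserBuilder::new().build().parse("a|b").unwrap();
    assert!(remove(hir, b'\n').is_ok());

    // A literal `\n` in the pattern is rejected with `LiteralNotAllowed`.
    let hir = ParserBuilder::new().build().parse(r"\w+\n\w+").unwrap();
    assert!(remove(hir, b'\n').is_err());
}
```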
use memchr::{memchr, memrchr};
use syntax::ParserBuilder;
use syntax::hir::Hir;
use regex::bytes::{Regex, RegexBuilder};

use literals::LiteralSets;
use nonl;
use smart_case::Cased;
use word_boundary::strip_unicode_word_boundaries;
use Result;

/// A matched line.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Match {
    start: usize,
    end: usize,
}

impl Match {
    /// Create a new empty match value.
    pub fn new() -> Match {
        Match::default()
    }

    /// Return the starting byte offset of the line that matched.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Return the ending byte offset of the line that matched.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }
}
/// A fast line oriented regex searcher.
#[derive(Clone, Debug)]
pub struct Grep {
    re: Regex,
    required: Option<Regex>,
    opts: Options,
}

/// A builder for a grep searcher.
#[derive(Clone, Debug)]
pub struct GrepBuilder {
    pattern: String,
    opts: Options,
}

#[derive(Clone, Debug)]
struct Options {
    case_insensitive: bool,
    case_smart: bool,
    line_terminator: u8,
    size_limit: usize,
    dfa_size_limit: usize,
}

impl Default for Options {
    fn default() -> Options {
        Options {
            case_insensitive: false,
            case_smart: false,
            line_terminator: b'\n',
            size_limit: 10 * (1 << 20),
            dfa_size_limit: 10 * (1 << 20),
        }
    }
}
impl GrepBuilder {
    /// Create a new builder for line searching.
    ///
    /// The pattern given should be a regular expression. The precise syntax
    /// supported is documented on the regex crate.
    pub fn new(pattern: &str) -> GrepBuilder {
        GrepBuilder {
            pattern: pattern.to_string(),
            opts: Options::default(),
        }
    }

    /// Set the line terminator.
    ///
    /// The line terminator can be any ASCII character and serves to delineate
    /// the match boundaries in the text searched.
    ///
    /// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII).
    pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder {
        assert!(ascii_byte <= 0x7F);
        self.opts.line_terminator = ascii_byte;
        self
    }

    /// Set the case insensitive (`i`) flag on the regex.
    pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder {
        self.opts.case_insensitive = yes;
        self
    }

    /// Whether to enable smart case search or not (disabled by default).
    ///
    /// Smart case uses case insensitive search if the pattern contains only
    /// lowercase characters (ignoring any characters which immediately follow
    /// a '\'). Otherwise, a case sensitive search is used instead.
    ///
    /// Enabling the case_insensitive flag overrides this.
    pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
        self.opts.case_smart = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a
    /// single compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(mut self, limit: usize) -> GrepBuilder {
        self.opts.size_limit = limit;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will use
    /// while searching.
    ///
    /// Note that this is a per thread limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder {
        self.opts.dfa_size_limit = limit;
        self
    }

    /// Create a line searcher.
    ///
    /// If there was a problem parsing or compiling the regex with the given
    /// options, then an error is returned.
    pub fn build(self) -> Result<Grep> {
        let expr = self.parse()?;
        let literals = LiteralSets::create(&expr);
        let re = self.regex(&expr)?;
        let required = match literals.to_regex_builder() {
            Some(builder) => Some(self.regex_build(builder)?),
            None => {
                match strip_unicode_word_boundaries(&expr) {
                    None => None,
                    Some(expr) => {
                        debug!("Stripped Unicode word boundaries. \
                                New AST:\n{:?}", expr);
                        self.regex(&expr).ok()
                    }
                }
            }
        };
        Ok(Grep {
            re: re,
            required: required,
            opts: self.opts,
        })
    }

    /// Creates a new regex from the given expression with the current
    /// configuration.
    fn regex(&self, expr: &Hir) -> Result<Regex> {
        let mut builder = RegexBuilder::new(&expr.to_string());
        builder.unicode(true);
        self.regex_build(builder)
    }

    /// Builds a new regex from the given builder using the caller's settings.
    fn regex_build(&self, mut builder: RegexBuilder) -> Result<Regex> {
        builder
            .multi_line(true)
            .size_limit(self.opts.size_limit)
            .dfa_size_limit(self.opts.dfa_size_limit)
            .build()
            .map_err(From::from)
    }

    /// Parses the underlying pattern and ensures the pattern can never match
    /// the line terminator.
    fn parse(&self) -> Result<Hir> {
        let expr = ParserBuilder::new()
            .allow_invalid_utf8(true)
            .case_insensitive(self.is_case_insensitive()?)
            .multi_line(true)
            .build()
            .parse(&self.pattern)?;
        debug!("original regex HIR pattern:\n{}", expr);
        let expr = nonl::remove(expr, self.opts.line_terminator)?;
        debug!("transformed regex HIR pattern:\n{}", expr);
        Ok(expr)
    }

    /// Determines whether the case insensitive flag should be enabled or not.
    fn is_case_insensitive(&self) -> Result<bool> {
        if self.opts.case_insensitive {
            return Ok(true);
        }
        if !self.opts.case_smart {
            return Ok(false);
        }
        let cased = match Cased::from_pattern(&self.pattern) {
            None => return Ok(false),
            Some(cased) => cased,
        };
        Ok(cased.any_literal && !cased.any_uppercase)
    }
}
impl Grep {
    /// Returns a reference to the underlying regex used by the searcher.
    pub fn regex(&self) -> &Regex {
        &self.re
    }

    /// Returns an iterator over all matches in the given buffer.
    pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> {
        Iter {
            searcher: self,
            buf: buf,
            start: 0,
        }
    }

    /// Fills in the next line that matches in the given buffer starting at
    /// the position given.
    ///
    /// If no match could be found, `false` is returned, otherwise, `true` is
    /// returned.
    pub fn read_match(
        &self,
        mat: &mut Match,
        buf: &[u8],
        mut start: usize,
    ) -> bool {
        if start >= buf.len() {
            return false;
        }
        if let Some(ref req) = self.required {
            while start < buf.len() {
                let e = match req.shortest_match(&buf[start..]) {
                    None => return false,
                    Some(e) => start + e,
                };
                let (prevnl, nextnl) = self.find_line(buf, e, e);
                match self.re.shortest_match(&buf[prevnl..nextnl]) {
                    None => {
                        start = nextnl;
                        continue;
                    }
                    Some(_) => {
                        self.fill_match(mat, prevnl, nextnl);
                        return true;
                    }
                }
            }
            false
        } else {
            let e = match self.re.shortest_match(&buf[start..]) {
                None => return false,
                Some(e) => start + e,
            };
            let (s, e) = self.find_line(buf, e, e);
            self.fill_match(mat, s, e);
            true
        }
    }

    fn fill_match(&self, mat: &mut Match, start: usize, end: usize) {
        mat.start = start;
        mat.end = end;
    }

    fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) {
        (self.find_line_start(buf, s), self.find_line_end(buf, e))
    }

    fn find_line_start(&self, buf: &[u8], pos: usize) -> usize {
        memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1)
    }

    fn find_line_end(&self, buf: &[u8], pos: usize) -> usize {
        memchr(self.opts.line_terminator, &buf[pos..])
            .map_or(buf.len(), |i| pos + i + 1)
    }
}
/// An iterator over all matches in a particular buffer.
///
/// `'b` refers to the lifetime of the buffer, and `'s` refers to the lifetime
/// of the searcher.
pub struct Iter<'b, 's> {
    searcher: &'s Grep,
    buf: &'b [u8],
    start: usize,
}

impl<'b, 's> Iterator for Iter<'b, 's> {
    type Item = Match;

    fn next(&mut self) -> Option<Match> {
        let mut mat = Match::default();
        if !self.searcher.read_match(&mut mat, self.buf, self.start) {
            self.start = self.buf.len();
            return None;
        }
        self.start = mat.end;
        Some(mat)
    }
}
#[cfg(test)]
mod tests {
    use memchr::{memchr, memrchr};
    use regex::bytes::Regex;
    use super::{GrepBuilder, Match};

    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");

    fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
        let re = Regex::new(pat).unwrap();
        let mut lines = vec![];
        for m in re.find_iter(haystack) {
            let start = memrchr(b'\n', &haystack[..m.start()])
                .map_or(0, |i| i + 1);
            let end = memchr(b'\n', &haystack[m.end()..])
                .map_or(haystack.len(), |i| m.end() + i + 1);
            lines.push(Match {
                start: start,
                end: end,
            });
        }
        lines
    }

    fn grep_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
        let g = GrepBuilder::new(pat).build().unwrap();
        g.iter(haystack).collect()
    }

    #[test]
    fn buffered_literal() {
        let expected = find_lines("Sherlock Holmes", SHERLOCK);
        let got = grep_lines("Sherlock Holmes", SHERLOCK);
        assert_eq!(expected.len(), got.len());
        assert_eq!(expected, got);
    }
}
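For reference, a minimal usage sketch of the `Grep` API removed in this commit, grounded in the builder and iterator shown above (the haystack is invented):

```rust
extern crate grep;

use grep::GrepBuilder;

fn main() {
    let searcher = GrepBuilder::new("Holmes")
        .case_smart(true)
        .build()
        .unwrap();
    let haystack = &b"Sherlock Holmes\nDr. Watson\n"[..];
    // Each `Match` spans a whole line, including its terminator.
    for m in searcher.iter(haystack) {
        print!("{}", String::from_utf8_lossy(&haystack[m.start()..m.end()]));
    }
}
```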
use syntax::ast::{self, Ast};
use syntax::ast::parse::Parser;

/// The results of analyzing a regex for cased literals.
#[derive(Clone, Debug, Default)]
pub struct Cased {
    /// True if and only if a literal uppercase character occurs in the regex.
    ///
    /// A regex like `\pL` contains no uppercase literals, even though `L`
    /// is uppercase and the `\pL` class contains uppercase characters.
    pub any_uppercase: bool,
    /// True if and only if the regex contains any literal at all. A regex like
    /// `\pL` has this set to false.
    pub any_literal: bool,
}

impl Cased {
    /// Returns a `Cased` value by doing analysis on the AST of `pattern`.
    ///
    /// If `pattern` is not a valid regular expression, then `None` is
    /// returned.
    pub fn from_pattern(pattern: &str) -> Option<Cased> {
        Parser::new()
            .parse(pattern)
            .map(|ast| Cased::from_ast(&ast))
            .ok()
    }

    fn from_ast(ast: &Ast) -> Cased {
        let mut cased = Cased::default();
        cased.from_ast_impl(ast);
        cased
    }

    fn from_ast_impl(&mut self, ast: &Ast) {
        if self.done() {
            return;
        }
        match *ast {
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {}
            Ast::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            Ast::Class(ast::Class::Bracketed(ref x)) => {
                self.from_ast_class_set(&x.kind);
            }
            Ast::Repetition(ref x) => {
                self.from_ast_impl(&x.ast);
            }
            Ast::Group(ref x) => {
                self.from_ast_impl(&x.ast);
            }
            Ast::Alternation(ref alt) => {
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
            Ast::Concat(ref alt) => {
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
        }
    }

    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
        if self.done() {
            return;
        }
        match *ast {
            ast::ClassSet::Item(ref item) => {
                self.from_ast_class_set_item(item);
            }
            ast::ClassSet::BinaryOp(ref x) => {
                self.from_ast_class_set(&x.lhs);
                self.from_ast_class_set(&x.rhs);
            }
        }
    }

    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
        if self.done() {
            return;
        }
        match *ast {
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {}
            ast::ClassSetItem::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            ast::ClassSetItem::Range(ref x) => {
                self.from_ast_literal(&x.start);
                self.from_ast_literal(&x.end);
            }
            ast::ClassSetItem::Bracketed(ref x) => {
                self.from_ast_class_set(&x.kind);
            }
            ast::ClassSetItem::Union(ref union) => {
                for x in &union.items {
                    self.from_ast_class_set_item(x);
                }
            }
        }
    }

    fn from_ast_literal(&mut self, ast: &ast::Literal) {
        self.any_literal = true;
        self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
    }

    /// Returns true if and only if the attributes can never change no matter
    /// what other AST it might see.
    fn done(&self) -> bool {
        self.any_uppercase && self.any_literal
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    fn cased(pattern: &str) -> Cased {
        Cased::from_pattern(pattern).unwrap()
    }

    #[test]
    fn various() {
        let x = cased("");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        let x = cased("foo");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased("Foo");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased("foO");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\\");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\w");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\S");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\p{Ll}");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo[a-z]");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo[A-Z]");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo[\S\t]");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\\S");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"\p{Ll}");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        let x = cased(r"aBc\w");
        assert!(x.any_uppercase);
        assert!(x.any_literal);
    }
}
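The smart-case decision in `GrepBuilder::is_case_insensitive` above boils down to one call into this module; a minimal sketch (`Cased` is crate-internal, so this is illustrative only):

```rust
fn main() {
    // Only patterns that contain literals, none of them uppercase,
    // trigger case-insensitive matching under smart case.
    let cased = Cased::from_pattern(r"foo\w").unwrap();
    assert!(cased.any_literal && !cased.any_uppercase);

    let cased = Cased::from_pattern(r"Foo").unwrap();
    assert!(cased.any_uppercase); // search stays case sensitive
}
```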
use syntax::hir::{self, Hir, HirKind};

/// Strips Unicode word boundaries from the given expression.
///
/// The key invariant this maintains is that the expression returned will match
/// *at least* everywhere the expression given will match. Namely, a match of
/// the returned expression can report false positives but it will never report
/// false negatives.
///
/// If no word boundaries could be stripped, then None is returned.
pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
    // The real reason we do this is because Unicode word boundaries are the
    // one thing that Rust's regex DFA engine can't handle. When it sees a
    // Unicode word boundary among non-ASCII text, it falls back to one of the
    // slower engines. We work around this limitation by attempting to use
    // a regex to find candidate matches without a Unicode word boundary. We'll
    // only then use the full (and slower) regex to confirm a candidate as a
    // match or not during search.
    //
    // It looks like we only check the outer edges for `\b`? I guess this is
    // an attempt to optimize for the `-w/--word-regexp` flag? ---AG
    match *expr.kind() {
        HirKind::Concat(ref es) if !es.is_empty() => {
            let first = is_unicode_word_boundary(&es[0]);
            let last = is_unicode_word_boundary(es.last().unwrap());
            // Be careful not to strip word boundaries if there are no other
            // expressions to match.
            match (first, last) {
                (true, false) if es.len() > 1 => {
                    Some(Hir::concat(es[1..].to_vec()))
                }
                (false, true) if es.len() > 1 => {
                    Some(Hir::concat(es[..es.len() - 1].to_vec()))
                }
                (true, true) if es.len() > 2 => {
                    Some(Hir::concat(es[1..es.len() - 1].to_vec()))
                }
                _ => None,
            }
        }
        _ => None,
    }
}

/// Returns true if the given expression is a Unicode word boundary.
fn is_unicode_word_boundary(expr: &Hir) -> bool {
    match *expr.kind() {
        HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
        HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
        HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
        _ => false,
    }
}
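To make the candidate/confirm strategy above concrete, an illustrative sketch using two plain regexes (not this crate's internals; the inputs are invented):

```rust
extern crate regex;

use regex::bytes::Regex;

fn main() {
    // The stripped regex over-approximates: it may report false positive
    // candidate lines, but never false negatives.
    let full = Regex::new(r"(?u)\bfoo\b").unwrap(); // slow path
    let candidate = Regex::new(r"foo").unwrap();    // boundaries stripped
    for line in [&b"a foo b"[..], &b"foobar"[..]].iter() {
        // "a foo b" is confirmed; "foobar" is a candidate but fails
        // confirmation against the full regex.
        if candidate.is_match(line) && full.is_match(line) {
            println!("confirmed: {:?}", line);
        }
    }
}
```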
This project is dual-licensed under the Unlicense and MIT licenses.
You may use this code under the terms of either license.
[package]
name = "grep2"
version = "0.2.0" #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = """
Fast line oriented regex searching as a library.
"""
documentation = "http://burntsushi.net/rustdoc/grep/"
homepage = "https://github.com/BurntSushi/ripgrep"
repository = "https://github.com/BurntSushi/ripgrep"
readme = "README.md"
keywords = ["regex", "grep", "egrep", "search", "pattern"]
license = "Unlicense/MIT"
[dependencies]
grep-matcher = { version = "0.0.1", path = "../grep-matcher" }
grep-printer = { version = "0.0.1", path = "../grep-printer" }
grep-regex = { version = "0.0.1", path = "../grep-regex" }
grep-searcher = { version = "0.0.1", path = "../grep-searcher" }
[features]
avx-accel = ["grep-searcher/avx-accel"]
simd-accel = ["grep-searcher/simd-accel"]
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
grep
----
This is a *library* that provides grep-style line-by-line regex searching (with
comparable performance to `grep` itself).
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
/*!
TODO.
*/

#![deny(missing_docs)]

pub extern crate grep_matcher as matcher;
pub extern crate grep_printer as printer;
pub extern crate grep_regex as regex;
pub extern crate grep_searcher as searcher;
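For contrast with the old `Grep` API, a minimal sketch of how the new facade's re-exports are meant to compose (this assumes the `sinks::UTF8` convenience sink from grep-searcher; the pattern and haystack are invented):

```rust
extern crate grep;

use grep::regex::RegexMatcher;
use grep::searcher::Searcher;
use grep::searcher::sinks::UTF8;

fn main() {
    let matcher = RegexMatcher::new(r"Watson").unwrap();
    let mut searcher = Searcher::new();
    searcher
        .search_slice(&matcher, b"Sherlock\nDr. Watson\n", UTF8(
            |line_number, line| {
                // Called once per matching line.
                print!("{}: {}", line_number, line);
                Ok(true) // keep searching
            },
        ))
        .unwrap();
}
```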
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
 use std::io;
 use std::path::Path;

-use grep2::printer::{ColorSpecs, PrinterPath};
+use grep::printer::{ColorSpecs, PrinterPath};
 use termcolor::WriteColor;

 /// A configuration for describing how paths should be written.
/*!
The pathutil module provides platform specific operations on paths that are
typically faster than the same operations as provided in `std::path`. In
particular, we really want to avoid the costly operation of parsing the path
into its constituent components. We give up on Windows, but on Unix, we deal
with the raw bytes directly.
*/

use std::path::Path;

/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(unix)]
pub fn strip_prefix<'a, P: AsRef<Path> + ?Sized>(
    prefix: &'a P,
    path: &'a Path,
) -> Option<&'a Path> {
    use std::ffi::OsStr;
    use std::os::unix::ffi::OsStrExt;

    let prefix = prefix.as_ref().as_os_str().as_bytes();
    let path = path.as_os_str().as_bytes();
    if prefix.len() > path.len() || prefix != &path[0..prefix.len()] {
        None
    } else {
        Some(Path::new(OsStr::from_bytes(&path[prefix.len()..])))
    }
}

/// Strip `prefix` from the `path` and return the remainder.
///
/// If `path` doesn't have a prefix `prefix`, then return `None`.
#[cfg(not(unix))]
pub fn strip_prefix<'a, P: AsRef<Path> + ?Sized>(
    prefix: &'a P,
    path: &'a Path,
) -> Option<&'a Path> {
    path.strip_prefix(prefix).ok()
}
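A small usage illustration of the Unix fast path (the paths are invented):

```rust
use std::path::Path;

fn main() {
    // On Unix this compares raw bytes and never re-parses the path.
    let path = Path::new("./src/main.rs");
    assert_eq!(strip_prefix("./", path), Some(Path::new("src/main.rs")));
    assert_eq!(strip_prefix("foo/", path), None);
}
```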
This diff is collapsed.
@@ -2,10 +2,10 @@ use std::io;
 use std::path::{Path, PathBuf};
 use std::time::Duration;

-use grep2::matcher::Matcher;
-use grep2::printer::{JSON, Standard, Summary, Stats};
-use grep2::regex::RegexMatcher;
-use grep2::searcher::Searcher;
+use grep::matcher::Matcher;
+use grep::printer::{JSON, Standard, Summary, Stats};
+use grep::regex::RegexMatcher;
+use grep::searcher::Searcher;
 use termcolor::WriteColor;

 use decompressor::{DecompressionReader, is_compressed};
@@ -95,7 +95,6 @@ impl SearchWorkerBuilder {
 #[derive(Clone, Debug, Default)]
 pub struct SearchResult {
     has_match: bool,
-    binary_byte_offset: Option<u64>,
     stats: Option<Stats>,
 }
@@ -105,15 +104,6 @@ impl SearchResult {
         self.has_match
     }

-    /// Whether the search found binary data, and if so, the first absolute
-    /// byte offset at which it was detected.
-    ///
-    /// This always returns `None` if binary data detection is disabled, even
-    /// when binary data is present.
-    pub fn binary_byte_offset(&self) -> Option<u64> {
-        self.binary_byte_offset
-    }
-
     /// Return aggregate search statistics for a single search, if available.
     ///
     /// It can be expensive to compute statistics, so these are only present
@@ -168,10 +158,8 @@ impl<W: WriteColor> Printer<W> {
         total_duration: Duration,
         stats: &Stats,
     ) -> io::Result<()> {
-        let mut wtr = self.get_mut();
         write!(
-            wtr,
+            self.get_mut(),
             "
 {matches} matches
 {lines} matched lines
@@ -295,9 +283,7 @@ fn search_path<M: Matcher, W: WriteColor>(
             searcher.search_path(&matcher, path, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: sink.stats().map(|s| s.clone()),
-                ..SearchResult::default()
             })
         }
         Printer::Summary(ref mut p) => {
@@ -305,9 +291,7 @@ fn search_path<M: Matcher, W: WriteColor>(
             searcher.search_path(&matcher, path, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: sink.stats().map(|s| s.clone()),
-                ..SearchResult::default()
             })
         }
         Printer::JSON(ref mut p) => {
@@ -315,9 +299,7 @@ fn search_path<M: Matcher, W: WriteColor>(
             searcher.search_path(&matcher, path, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: Some(sink.stats().clone()),
-                ..SearchResult::default()
             })
         }
     }
@@ -338,9 +320,7 @@ fn search_reader<M: Matcher, R: io::Read, W: WriteColor>(
             searcher.search_reader(&matcher, rdr, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: sink.stats().map(|s| s.clone()),
-                ..SearchResult::default()
             })
         }
         Printer::Summary(ref mut p) => {
@@ -348,9 +328,7 @@ fn search_reader<M: Matcher, R: io::Read, W: WriteColor>(
             searcher.search_reader(&matcher, rdr, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: sink.stats().map(|s| s.clone()),
-                ..SearchResult::default()
             })
         }
         Printer::JSON(ref mut p) => {
@@ -358,9 +336,7 @@ fn search_reader<M: Matcher, R: io::Read, W: WriteColor>(
             searcher.search_reader(&matcher, rdr, &mut sink)?;
             Ok(SearchResult {
                 has_match: sink.has_match(),
-                binary_byte_offset: sink.binary_byte_offset(),
                 stats: Some(sink.stats().clone()),
-                ..SearchResult::default()
             })
         }
     }
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.