提交 0473df1e 编写于 作者: A Andrew Gallant

Disable Unicode mode for literal regex.

When ripgrep detects a literal, it emits them as raw hex escaped byte
sequences to Regex::new. This permits literal optimizations for arbitrary
byte sequences (i.e., possibly invalid UTF-8). The problem is that
Regex::new interprets hex escaped byte sequences as *Unicode codepoints*
by default, but we want them to actually stand for their raw byte values.
Therefore, disable Unicode mode.

This is OK, since the regex is composed entirely of literals and literal
extraction does Unicode case folding.

Fixes #251
上级 301a3fd7
......@@ -79,12 +79,12 @@ impl LiteralSets {
debug!("required literals found: {:?}", req_lits);
let alts: Vec<String> =
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
Some(RegexBuilder::new(&alts.join("|")))
Some(RegexBuilder::new(&alts.join("|")).unicode(false))
} else if lit.is_empty() {
None
} else {
debug!("required literal found: {:?}", show(lit));
Some(RegexBuilder::new(&bytes_to_regex(lit)))
Some(RegexBuilder::new(&bytes_to_regex(&lit)).unicode(false))
}
}
}
......
......@@ -167,14 +167,13 @@ impl GrepBuilder {
/// Creates a new regex from the given expression with the current
/// configuration.
fn regex(&self, expr: &Expr) -> Result<Regex> {
self.regex_build(RegexBuilder::new(&expr.to_string()))
self.regex_build(RegexBuilder::new(&expr.to_string()).unicode(true))
}
/// Builds a new regex from the given builder using the caller's settings.
fn regex_build(&self, builder: RegexBuilder) -> Result<Regex> {
builder
.multi_line(true)
.unicode(true)
.size_limit(self.opts.size_limit)
.dfa_size_limit(self.opts.dfa_size_limit)
.compile()
......
......@@ -936,6 +936,15 @@ clean!(regression_229, "[E]conomie", ".", |wd: WorkDir, mut cmd: Command| {
wd.assert_err(&mut cmd);
});
// See: https://github.com/BurntSushi/ripgrep/issues/251
clean!(regression_251, "привет", ".", |wd: WorkDir, mut cmd: Command| {
wd.create("foo", "привет\nПривет\nПрИвЕт");
cmd.arg("-i");
let lines: String = wd.stdout(&mut cmd);
assert_eq!(lines, "foo:привет\nfoo:Привет\nfoo:ПрИвЕт\n");
});
// See: https://github.com/BurntSushi/ripgrep/issues/7
sherlock!(feature_7, "-fpat", "sherlock", |wd: WorkDir, mut cmd: Command| {
wd.create("pat", "Sherlock\nHolmes");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册