Commit 993b9200 authored by Igor Aleksanov

librustc_lexer: Enhance documentation

Apply review suggestions

Apply review suggestions
Parent commit: 084edc42
use std::str::Chars;
/// Peekable iterator over a char sequence.
///
/// Next characters can be peeked via `nth_char` method,
/// and position can be shifted forward via `bump` method.
pub(crate) struct Cursor<'a> {
initial_len: usize,
chars: Chars<'a>,
......@@ -18,7 +22,9 @@ pub(crate) fn new(input: &'a str) -> Cursor<'a> {
prev: EOF_CHAR,
}
}
/// For debug assertions only
/// Returns the last eaten symbol (or '\0' in release builds).
pub(crate) fn prev(&self) -> char {
#[cfg(debug_assertions)]
{
......@@ -30,19 +36,30 @@ pub(crate) fn prev(&self) -> char {
'\0'
}
}
/// Returns the `n`th character relative to the current cursor position.
/// If the requested position doesn't exist, `EOF_CHAR` is returned.
/// However, getting `EOF_CHAR` doesn't always mean actual end of file,
/// it should be checked with the `is_eof` method.
pub(crate) fn nth_char(&self, n: usize) -> char {
    // Peek ahead on a cloned iterator; the cursor itself is not advanced.
    match self.chars().nth(n) {
        Some(c) => c,
        None => EOF_CHAR,
    }
}
/// Checks if there is nothing more to consume.
pub(crate) fn is_eof(&self) -> bool {
    // No next character on a cloned iterator means the input is exhausted.
    self.chars.clone().next().is_none()
}
/// Returns the amount of already consumed symbols (in bytes).
pub(crate) fn len_consumed(&self) -> usize {
    // The difference between the initial length and what remains
    // is exactly how much has been eaten so far.
    let remaining = self.chars.as_str().len();
    self.initial_len - remaining
}
/// Returns a `Chars` iterator over the remaining characters.
fn chars(&self) -> Chars<'a> {
    // Cloning is cheap: `Chars` is a thin wrapper over a slice iterator.
    self.chars.clone()
}
/// Moves to the next character.
pub(crate) fn bump(&mut self) -> Option<char> {
let c = self.chars.next()?;
......
//! Low-level Rust lexer.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax,
//! for that see `libsyntax::parse::lexer`, which converts this basic token stream
//! into wide tokens used by actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so building an actual Rust token stream will
//! be easier.
//!
//! Main entity of this crate is [`TokenKind`] enum which represents common
//! lexeme types.
// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.
......@@ -6,78 +19,144 @@
use crate::cursor::{Cursor, EOF_CHAR};
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
pub struct Token {
    /// Kind of the lexed token.
    pub kind: TokenKind,
    /// Length of the token in bytes.
    pub len: usize,
}

impl Token {
    /// Creates a new token of the given kind and length.
    fn new(kind: TokenKind, len: usize) -> Token {
        Token { kind, len }
    }
}
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    // Multi-char tokens:
    /// "// comment"
    LineComment,
    /// "/* block comment */"
    ///
    /// Block comments can be recursive, so the sequence like "/* /* */"
    /// will not be considered terminated and will result in a parsing error.
    BlockComment { terminated: bool },
    /// Any whitespace characters sequence.
    Whitespace,
    /// "ident" or "continue"
    ///
    /// At this step keywords are also considered identifiers.
    Ident,
    /// "r#ident"
    RawIdent,
    /// "12_u8", "1.0e-40", b"123". See `LiteralKind` for more details.
    Literal { kind: LiteralKind, suffix_start: usize },
    /// "'a"
    Lifetime { starts_with_number: bool },

    // One-char tokens:
    /// ";"
    Semi,
    /// ","
    Comma,
    /// "."
    Dot,
    /// "("
    OpenParen,
    /// ")"
    CloseParen,
    /// "{"
    OpenBrace,
    /// "}"
    CloseBrace,
    /// "["
    OpenBracket,
    /// "]"
    CloseBracket,
    /// "@"
    At,
    /// "#"
    Pound,
    /// "~"
    Tilde,
    /// "?"
    Question,
    /// ":"
    Colon,
    /// "$"
    Dollar,
    /// "="
    Eq,
    /// "!"
    Not,
    /// "<"
    Lt,
    /// ">"
    Gt,
    /// "-"
    Minus,
    /// "&"
    And,
    /// "|"
    Or,
    /// "+"
    Plus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// "^"
    Caret,
    /// "%"
    Percent,
    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
}
use self::TokenKind::*;
/// Enum representing the literal types supported by the lexer.
/// Each variant records enough information (base, termination, hash count)
/// for later stages to validate and unescape the literal.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// "12_u8", "0o100", "0b120i99"
    Int { base: Base, empty_int: bool },
    /// "12.34f32", "0b100.100"
    Float { base: Base, empty_exponent: bool },
    /// "'a'", "'\\'", "'''", "';"
    Char { terminated: bool },
    /// "b'a'", "b'\\'", "b'''", "b';"
    Byte { terminated: bool },
    /// ""abc"", ""abc"
    Str { terminated: bool },
    /// "b"abc"", "b"abc"
    ByteStr { terminated: bool },
    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
    RawStr { n_hashes: usize, started: bool, terminated: bool },
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
    RawByteStr { n_hashes: usize, started: bool, terminated: bool },
}
use self::LiteralKind::*;
/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b" (base 2).
    Binary,
    /// Literal starts with "0o" (base 8).
    Octal,
    /// Literal starts with "0x" (base 16).
    Hexadecimal,
    /// Literal doesn't contain a prefix (base 10).
    Decimal,
}
impl Token {
    /// Creates a new token of the given kind and length.
    fn new(kind: TokenKind, len: usize) -> Token {
        Token { kind, len }
    }
}
/// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
/// but shebang isn't a part of rust syntax, so this function
/// skips the line if it starts with a shebang ("#!").
/// Line won't be skipped if it represents a valid Rust syntax
/// (e.g. "#![deny(missing_docs)]").
pub fn strip_shebang(input: &str) -> Option<usize> {
debug_assert!(!input.is_empty());
if !input.starts_with("#!") || input.starts_with("#![") {
......@@ -86,11 +165,13 @@ pub fn strip_shebang(input: &str) -> Option<usize> {
Some(input.find('\n').unwrap_or(input.len()))
}
/// Parses the first token from the provided input string.
///
/// The input must be non-empty; this is enforced by a debug assertion.
pub fn first_token(input: &str) -> Token {
    debug_assert!(!input.is_empty());
    let mut cursor = Cursor::new(input);
    cursor.advance_token()
}
/// Creates an iterator that produces tokens from the input string.
pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
std::iter::from_fn(move || {
if input.is_empty() {
......@@ -102,10 +183,9 @@ pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
})
}
// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
// classes.
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
// This is Pattern_White_Space.
//
......@@ -137,6 +217,8 @@ pub fn is_whitespace(c: char) -> bool {
}
/// True if `c` is valid as a first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_start(c: char) -> bool {
// This is XID_Start OR '_' (which formally is not a XID_Start).
// We also add fast-path for ascii idents
......@@ -147,6 +229,8 @@ pub fn is_id_start(c: char) -> bool {
}
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
// This is exactly XID_Continue.
// We also add fast-path for ascii idents
......@@ -159,15 +243,21 @@ pub fn is_id_continue(c: char) -> bool {
impl Cursor<'_> {
/// Parses a token from the input string.
fn advance_token(&mut self) -> Token {
let first_char = self.bump().unwrap();
let token_kind = match first_char {
// Slash, comment or block comment.
'/' => match self.nth_char(0) {
'/' => self.line_comment(),
'*' => self.block_comment(),
_ => Slash,
},
// Whitespace sequence.
c if is_whitespace(c) => self.whitespace(),
// Raw string literal or identifier.
'r' => match (self.nth_char(0), self.nth_char(1)) {
('#', c1) if is_id_start(c1) => self.raw_ident(),
('#', _) | ('"', _) => {
......@@ -181,6 +271,8 @@ fn advance_token(&mut self) -> Token {
}
_ => self.ident(),
},
// Byte literal, byte string literal, raw byte string literal or identifier.
'b' => match (self.nth_char(0), self.nth_char(1)) {
('\'', _) => {
self.bump();
......@@ -214,13 +306,20 @@ fn advance_token(&mut self) -> Token {
}
_ => self.ident(),
},
// Identifier (this should be checked after other variant that can
// start as identifier).
c if is_id_start(c) => self.ident(),
// Numeric literal.
c @ '0'..='9' => {
let literal_kind = self.number(c);
let suffix_start = self.len_consumed();
self.eat_literal_suffix();
TokenKind::Literal { kind: literal_kind, suffix_start }
}
// One-symbol tokens.
';' => Semi,
',' => Comma,
'.' => Dot,
......@@ -247,7 +346,11 @@ fn advance_token(&mut self) -> Token {
'*' => Star,
'^' => Caret,
'%' => Percent,
// Lifetime or character literal.
'\'' => self.lifetime_or_char(),
// String literal.
'"' => {
let terminated = self.double_quoted_string();
let suffix_start = self.len_consumed();
......@@ -291,6 +394,9 @@ fn block_comment(&mut self) -> TokenKind {
self.bump();
depth -= 1;
if depth == 0 {
// This block comment is closed, so for a construction like "/* */ */"
// there will be a successfully parsed block comment "/* */"
// and " */" will be processed separately.
break;
}
}
......@@ -335,6 +441,7 @@ fn number(&mut self, first_digit: char) -> LiteralKind {
debug_assert!('0' <= self.prev() && self.prev() <= '9');
let mut base = Base::Decimal;
if first_digit == '0' {
// Attempt to parse encoding base.
let has_digits = match self.nth_char(0) {
'b' => {
base = Base::Binary;
......@@ -351,17 +458,21 @@ fn number(&mut self, first_digit: char) -> LiteralKind {
self.bump();
self.eat_hexadecimal_digits()
}
// Not a base prefix.
'0'..='9' | '_' | '.' | 'e' | 'E' => {
self.eat_decimal_digits();
true
}
// just a 0
// Just a 0.
_ => return Int { base, empty_int: false },
};
// Base prefix was provided, but there were no digits
// after it, e.g. "0x".
if !has_digits {
return Int { base, empty_int: true };
}
} else {
// No base prefix, parse number in the usual way.
self.eat_decimal_digits();
};
......@@ -400,6 +511,9 @@ fn number(&mut self, first_digit: char) -> LiteralKind {
fn lifetime_or_char(&mut self) -> TokenKind {
debug_assert!(self.prev() == '\'');
let mut starts_with_number = false;
// Check if the first symbol after '\'' is a valid identifier
// character or a number (not a digit followed by '\'').
if (is_id_start(self.nth_char(0))
|| self.nth_char(0).is_digit(10) && {
starts_with_number = true;
......@@ -408,6 +522,8 @@ fn lifetime_or_char(&mut self) -> TokenKind {
&& self.nth_char(1) != '\''
{
self.bump();
// Skip the identifier.
while is_id_continue(self.nth_char(0)) {
self.bump();
}
......@@ -420,6 +536,8 @@ fn lifetime_or_char(&mut self) -> TokenKind {
Lifetime { starts_with_number }
};
}
// This is not a lifetime (checked above), parse a char literal.
let terminated = self.single_quoted_string();
let suffix_start = self.len_consumed();
if terminated {
......@@ -431,24 +549,32 @@ fn lifetime_or_char(&mut self) -> TokenKind {
fn single_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '\'');
// parse `'''` as a single char literal
// Parse `'''` as a single char literal.
if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
self.bump();
}
// Parse until either quotes are terminated or error is detected.
let mut first = true;
loop {
match self.nth_char(0) {
// Probably beginning of the comment, which we don't want to include
// to the error report.
'/' if !first => break,
// Newline without following '\'' means unclosed quote, stop parsing.
'\n' if self.nth_char(1) != '\'' => break,
// End of file, stop parsing.
EOF_CHAR if self.is_eof() => break,
// Quotes are terminated, finish parsing.
'\'' => {
self.bump();
return true;
}
// Escaped slash is considered one character, so bump twice.
'\\' => {
self.bump();
self.bump();
}
// Skip the character.
_ => {
self.bump();
}
......@@ -458,6 +584,8 @@ fn single_quoted_string(&mut self) -> bool {
false
}
/// Eats double-quoted string and returns true
/// if string is terminated.
fn double_quoted_string(&mut self) -> bool {
debug_assert!(self.prev() == '"');
loop {
......@@ -476,8 +604,11 @@ fn double_quoted_string(&mut self) -> bool {
}
}
/// Eats the double-quoted string and returns a tuple of
/// (amount of the '#' symbols, raw string started, raw string terminated)
fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
debug_assert!(self.prev() == 'r');
// Count opening '#' symbols.
let n_hashes = {
let mut acc: usize = 0;
loop {
......@@ -489,6 +620,8 @@ fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
}
};
// Skip the string itself and check that amount of closing '#'
// symbols is equal to the amount of opening ones.
loop {
match self.bump() {
Some('"') => {
......@@ -549,6 +682,7 @@ fn float_exponent(&mut self) -> Result<(), ()> {
if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
}
// Eats the suffix if it's an identifier.
fn eat_literal_suffix(&mut self) {
if !is_id_start(self.nth_char(0)) {
return;
......
......@@ -7,32 +7,54 @@
#[cfg(test)]
mod tests;
/// Errors that can occur during string unescaping.
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// Expected 1 char, but 0 were found.
    ZeroChars,
    /// Expected 1 char, but more than 1 were found.
    MoreThanOneChar,
    /// Escaped '\' character without continuation.
    LoneSlash,
    /// Invalid escape character (e.g. '\z').
    InvalidEscape,
    /// Raw '\r' encountered.
    BareCarriageReturn,
    /// Raw '\r' encountered in raw string.
    BareCarriageReturnInRawString,
    /// Unescaped character that was expected to be escaped (e.g. raw '\t').
    EscapeOnlyChar,
    /// Numeric character escape is too short (e.g. '\x1').
    TooShortHexEscape,
    /// Invalid character in numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Character code in numeric escape is non-ascii (e.g. '\xFF').
    OutOfRangeHexEscape,
    /// '\u' not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal value in '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// Empty unicode escape, i.e. '\u{}'.
    EmptyUnicodeEscape,
    /// No closing brace in '\u{..}', e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// Underscore as the first digit, i.e. '\u{_12}'.
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'.
    OverlongUnicodeEscape,
    /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,
    /// Unicode escape code in byte literal.
    UnicodeEscapeInByte,
    /// Non-ascii character in byte literal.
    NonAsciiCharInByte,
    /// Non-ascii character in byte string literal.
    NonAsciiCharInByteString,
}
......@@ -44,15 +66,8 @@ pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
.map_err(|err| (literal_text.len() - chars.as_str().len(), err))
}
/// Takes the contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
/// Values are returned through invoking of the provided callback.
pub fn unescape_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    unescape_str_or_byte_str(literal_text, Mode::Str, callback)
}
/// Takes a contents of a byte literal (without quotes), and returns an
/// unescaped byte or an error.
pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
let mut chars = literal_text.chars();
unescape_char_or_byte(&mut chars, Mode::Byte)
......@@ -62,6 +77,17 @@ pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
/// Takes the contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
/// Values are returned through invoking of the provided callback.
pub fn unescape_str<F>(literal_text: &str, callback: &mut F)
where
    F: FnMut(Range<usize>, Result<char, EscapeError>),
{
    // Plain string literals are the `Str` mode of the shared helper.
    let mode = Mode::Str;
    unescape_str_or_byte_str(literal_text, mode, callback)
}
/// Takes a contents of a byte string literal (without quotes) and produces a
/// sequence of bytes or errors.
/// Values are returned through invoking of the provided callback.
pub fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
where
F: FnMut(Range<usize>, Result<u8, EscapeError>),
......@@ -71,8 +97,9 @@ pub fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
})
}
/// Takes a contents of a string literal (without quotes) and produces a
/// Takes a contents of a raw string literal (without quotes) and produces a
/// sequence of characters or errors.
/// Values are returned through invoking of the provided callback.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR.
pub fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
......@@ -82,8 +109,9 @@ pub fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
}
/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of characters or errors.
/// Takes a contents of a raw byte string literal (without quotes) and produces a
/// sequence of bytes or errors.
/// Values are returned through invoking of the provided callback.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only translate CRLF to LF and produce errors on bare CR.
pub fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
......@@ -95,6 +123,7 @@ pub fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
})
}
/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy)]
pub enum Mode {
Char,
......@@ -126,6 +155,8 @@ pub fn is_bytes(self) -> bool {
fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
if first_char != '\\' {
// Previous character was not a slash, and we don't expect it to be
// an escape-only character.
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
......@@ -133,6 +164,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
if mode.is_bytes() && !first_char.is_ascii() {
// Byte literal can't be a non-ascii character.
return Err(EscapeError::NonAsciiCharInByte);
}
Ok(first_char)
......@@ -140,6 +172,8 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
};
}
// Previous character is '\\', try to unescape it.
let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
let res = match second_char {
......@@ -152,6 +186,8 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
'0' => '\0',
'x' => {
// Parse hexadecimal character code.
let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
......@@ -160,6 +196,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
let value = hi * 16 + lo;
// For a byte literal verify that it is within ASCII range.
if !mode.is_bytes() && !is_ascii(value) {
return Err(EscapeError::OutOfRangeHexEscape);
}
......@@ -169,10 +206,13 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
}
'u' => {
// We've parsed '\u', now we have to parse '{..}'.
if chars.next() != Some('{') {
return Err(EscapeError::NoBraceInUnicodeEscape);
}
// First characrer must be a hexadecimal digit.
let mut n_digits = 1;
let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
'_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
......@@ -180,6 +220,8 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
};
// First character is valid, now parse the rest of the number
// and closing brace.
loop {
match chars.next() {
None => return Err(EscapeError::UnclosedUnicodeEscape),
......@@ -188,6 +230,9 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
if n_digits > 6 {
return Err(EscapeError::OverlongUnicodeEscape);
}
// Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal.
if mode.is_bytes() {
return Err(EscapeError::UnicodeEscapeInByte);
}
......@@ -204,6 +249,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
n_digits += 1;
if n_digits > 6 {
// Stop updating value since we're sure that it's is incorrect already.
continue;
}
let digit = digit as u32;
......@@ -243,6 +289,10 @@ fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
let second_char = chars.clone().next();
match second_char {
Some('\n') => {
// Rust language specification requires us to skip whitespaces
// if unescaped '\' character is followed by '\n'.
// For details see [Rust language reference]
// (https://doc.rust-lang.org/reference/tokens.html#string-literals).
skip_ascii_whitespace(&mut chars);
continue;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册