From 710a71719210ecdfca8c7bb185bcabef8cdeda49 Mon Sep 17 00:00:00 2001 From: Adam Barth Date: Mon, 27 Oct 2014 17:04:52 -0700 Subject: [PATCH] Parse comments according to parsing.md Now we don't even generate comment tokens. The new tokenizer for comments is much easier. R=eseidel@chromium.org Review URL: https://codereview.chromium.org/682893002 --- engine/core/html/parser/AtomicHTMLToken.h | 10 +- engine/core/html/parser/CompactHTMLToken.cpp | 1 - engine/core/html/parser/HTMLToken.h | 25 +-- engine/core/html/parser/HTMLTokenizer.cpp | 158 ++++--------------- engine/core/html/parser/HTMLTokenizer.h | 17 +- engine/core/html/parser/HTMLTreeBuilder.cpp | 3 +- tests/parser/comments-expected.txt | 1 + tests/parser/comments.html | 15 ++ 8 files changed, 54 insertions(+), 176 deletions(-) create mode 100644 tests/parser/comments-expected.txt create mode 100644 tests/parser/comments.html diff --git a/engine/core/html/parser/AtomicHTMLToken.h b/engine/core/html/parser/AtomicHTMLToken.h index c23c42d04..0ba618499 100644 --- a/engine/core/html/parser/AtomicHTMLToken.h +++ b/engine/core/html/parser/AtomicHTMLToken.h @@ -83,12 +83,6 @@ public: return m_data; } - const String& comment() const - { - ASSERT(m_type == HTMLToken::Comment); - return m_data; - } - explicit AtomicHTMLToken(HTMLToken& token) : m_type(token.type()) { @@ -109,7 +103,6 @@ public: break; } case HTMLToken::Character: - case HTMLToken::Comment: if (token.isAll8BitData()) m_data = String::make8BitFrom16BitSource(token.data()); else @@ -141,7 +134,6 @@ public: m_name = AtomicString(token.data()); break; case HTMLToken::Character: - case HTMLToken::Comment: m_data = token.data(); break; } @@ -175,7 +167,7 @@ private: // "name" for StartTag and EndTag AtomicString m_name; - // "data" for Comment, "characters" for Character + // "characters" for Character String m_data; // For StartTag and EndTag diff --git a/engine/core/html/parser/CompactHTMLToken.cpp b/engine/core/html/parser/CompactHTMLToken.cpp index 
46b4a5740..b4ec1c19b 100644 --- a/engine/core/html/parser/CompactHTMLToken.cpp +++ b/engine/core/html/parser/CompactHTMLToken.cpp @@ -59,7 +59,6 @@ CompactHTMLToken::CompactHTMLToken(const HTMLToken* token, const TextPosition& t case HTMLToken::EndTag: m_selfClosing = token->selfClosing(); // Fall through! - case HTMLToken::Comment: case HTMLToken::Character: { m_isAll8BitData = token->isAll8BitData(); m_data = attemptStaticStringCreation(token->data(), token->isAll8BitData() ? Force8Bit : Force16Bit); diff --git a/engine/core/html/parser/HTMLToken.h b/engine/core/html/parser/HTMLToken.h index 293459342..d716906bb 100644 --- a/engine/core/html/parser/HTMLToken.h +++ b/engine/core/html/parser/HTMLToken.h @@ -50,7 +50,6 @@ public: Uninitialized, StartTag, EndTag, - Comment, Character, EndOfFile, }; @@ -114,7 +113,7 @@ public: const DataVector& data() const { - ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag); + ASSERT(m_type == Character || m_type == StartTag || m_type == EndTag); return m_data; } @@ -298,28 +297,6 @@ public: m_data.appendVector(characters); } - /* Comment Tokens */ - - const DataVector& comment() const - { - ASSERT(m_type == Comment); - return m_data; - } - - void beginComment() - { - ASSERT(m_type == Uninitialized); - m_type = Comment; - } - - void appendToComment(UChar character) - { - ASSERT(character); - ASSERT(m_type == Comment); - m_data.append(character); - m_orAllData |= character; - } - private: Type m_type; Attribute::Range m_range; // Always starts at zero. 
diff --git a/engine/core/html/parser/HTMLTokenizer.cpp b/engine/core/html/parser/HTMLTokenizer.cpp index ca454ad3c..6c4c21099 100644 --- a/engine/core/html/parser/HTMLTokenizer.cpp +++ b/engine/core/html/parser/HTMLTokenizer.cpp @@ -235,21 +235,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) HTML_BEGIN_STATE(TagOpenState) { if (cc == '!') - HTML_ADVANCE_TO(MarkupDeclarationOpenState); + HTML_ADVANCE_TO(CommentStart1State); else if (cc == '/') - HTML_ADVANCE_TO(EndTagOpenState); + HTML_ADVANCE_TO(CloseTagState); else if (isASCIIUpper(cc)) { m_token->beginStartTag(toLowerCase(cc)); HTML_ADVANCE_TO(TagNameState); } else if (isASCIILower(cc)) { m_token->beginStartTag(cc); HTML_ADVANCE_TO(TagNameState); - } else if (cc == '?') { - parseError(); - // The spec consumes the current character before switching - // to the bogus comment state, but it's easier to implement - // if we reconsume the current character. - HTML_RECONSUME_IN(BogusCommentState); } else { parseError(); bufferCharacter('<'); @@ -258,7 +252,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) } END_STATE() - HTML_BEGIN_STATE(EndTagOpenState) { + HTML_BEGIN_STATE(CloseTagState) { if (isASCIIUpper(cc)) { m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); m_appropriateEndTagName.clear(); @@ -268,16 +262,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) m_appropriateEndTagName.clear(); HTML_ADVANCE_TO(TagNameState); } else if (cc == '>') { - parseError(); + bufferCharacter('<'); + bufferCharacter('/'); + bufferCharacter('>'); HTML_ADVANCE_TO(DataState); - } else if (cc == kEndOfFileMarker) { - parseError(); + } else { bufferCharacter('<'); bufferCharacter('/'); HTML_RECONSUME_IN(DataState); - } else { - parseError(); - HTML_RECONSUME_IN(BogusCommentState); } } END_STATE() @@ -571,144 +563,54 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) } END_STATE() - HTML_BEGIN_STATE(BogusCommentState) { - 
m_token->beginComment(); - HTML_RECONSUME_IN(ContinueBogusCommentState); - } - END_STATE() - - HTML_BEGIN_STATE(ContinueBogusCommentState) { - if (cc == '>') - return emitAndResumeIn(source, HTMLTokenizer::DataState); - else if (cc == kEndOfFileMarker) - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - else { - m_token->appendToComment(cc); - HTML_ADVANCE_TO(ContinueBogusCommentState); - } - } - END_STATE() - - HTML_BEGIN_STATE(MarkupDeclarationOpenState) { + HTML_BEGIN_STATE(CommentStart1State) { if (cc == '-') { - SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash); - if (result == SegmentedString::DidMatch) { - source.advanceAndASSERT('-'); - source.advanceAndASSERT('-'); - m_token->beginComment(); - HTML_SWITCH_TO(CommentStartState); - } else if (result == SegmentedString::NotEnoughCharacters) - return haveBufferedCharacterToken(); - } - parseError(); - HTML_RECONSUME_IN(BogusCommentState); - } - END_STATE() - - HTML_BEGIN_STATE(CommentStartState) { - if (cc == '-') - HTML_ADVANCE_TO(CommentStartDashState); - else if (cc == '>') { - parseError(); - return emitAndResumeIn(source, HTMLTokenizer::DataState); - } else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); + HTML_ADVANCE_TO(CommentStart2State); } else { - m_token->appendToComment(cc); - HTML_ADVANCE_TO(CommentState); + bufferCharacter('<'); + bufferCharacter('!'); + HTML_RECONSUME_IN(DataState); } } END_STATE() - HTML_BEGIN_STATE(CommentStartDashState) { - if (cc == '-') - HTML_ADVANCE_TO(CommentEndState); - else if (cc == '>') { - parseError(); - return emitAndResumeIn(source, HTMLTokenizer::DataState); - } else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - } else { - m_token->appendToComment('-'); - m_token->appendToComment(cc); + HTML_BEGIN_STATE(CommentStart2State) { + if (cc == '-') { HTML_ADVANCE_TO(CommentState); + } else { + 
bufferCharacter('<'); + bufferCharacter('!'); + bufferCharacter('-'); + HTML_RECONSUME_IN(DataState); } } END_STATE() HTML_BEGIN_STATE(CommentState) { if (cc == '-') - HTML_ADVANCE_TO(CommentEndDashState); - else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - } else { - m_token->appendToComment(cc); + HTML_ADVANCE_TO(CommentEnd1State); + else HTML_ADVANCE_TO(CommentState); - } } END_STATE() - HTML_BEGIN_STATE(CommentEndDashState) { + HTML_BEGIN_STATE(CommentEnd1State) { if (cc == '-') - HTML_ADVANCE_TO(CommentEndState); - else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - } else { - m_token->appendToComment('-'); - m_token->appendToComment(cc); - HTML_ADVANCE_TO(CommentState); - } - } - END_STATE() - - HTML_BEGIN_STATE(CommentEndState) { - if (cc == '>') - return emitAndResumeIn(source, HTMLTokenizer::DataState); - else if (cc == '!') { - parseError(); - HTML_ADVANCE_TO(CommentEndBangState); - } else if (cc == '-') { - parseError(); - m_token->appendToComment('-'); - HTML_ADVANCE_TO(CommentEndState); - } else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - } else { - parseError(); - m_token->appendToComment('-'); - m_token->appendToComment('-'); - m_token->appendToComment(cc); + HTML_ADVANCE_TO(CommentEnd2State); + else HTML_ADVANCE_TO(CommentState); - } } END_STATE() - HTML_BEGIN_STATE(CommentEndBangState) { - if (cc == '-') { - m_token->appendToComment('-'); - m_token->appendToComment('-'); - m_token->appendToComment('!'); - HTML_ADVANCE_TO(CommentEndDashState); - } else if (cc == '>') - return emitAndResumeIn(source, HTMLTokenizer::DataState); - else if (cc == kEndOfFileMarker) { - parseError(); - return emitAndReconsumeIn(source, HTMLTokenizer::DataState); - } else { - m_token->appendToComment('-'); - m_token->appendToComment('-'); - m_token->appendToComment('!'); - 
m_token->appendToComment(cc); + HTML_BEGIN_STATE(CommentEnd2State) { + if (cc == '-') + HTML_ADVANCE_TO(CommentEnd2State); + else if (cc == '>') + HTML_ADVANCE_TO(DataState); + else HTML_ADVANCE_TO(CommentState); - } } END_STATE() - } ASSERT_NOT_REACHED(); diff --git a/engine/core/html/parser/HTMLTokenizer.h b/engine/core/html/parser/HTMLTokenizer.h index a22cf50df..bc8c48af0 100644 --- a/engine/core/html/parser/HTMLTokenizer.h +++ b/engine/core/html/parser/HTMLTokenizer.h @@ -49,7 +49,7 @@ public: CharacterReferenceInAttributeValueState, RAWTEXTState, TagOpenState, - EndTagOpenState, + CloseTagState, TagNameState, RAWTEXTLessThanSignState, RAWTEXTEndTagOpenState, @@ -63,18 +63,11 @@ public: AttributeValueUnquotedState, AfterAttributeValueQuotedState, SelfClosingStartTagState, - BogusCommentState, - // The ContinueBogusCommentState is not in the HTML5 spec, but we use - // it internally to keep track of whether we've started the bogus - // comment token yet. - ContinueBogusCommentState, - MarkupDeclarationOpenState, - CommentStartState, - CommentStartDashState, + CommentStart1State, + CommentStart2State, CommentState, - CommentEndDashState, - CommentEndState, - CommentEndBangState, + CommentEnd1State, + CommentEnd2State, }; // This function returns true if it emits a token. Otherwise, callers diff --git a/engine/core/html/parser/HTMLTreeBuilder.cpp b/engine/core/html/parser/HTMLTreeBuilder.cpp index 89fc3a854..710a99df3 100644 --- a/engine/core/html/parser/HTMLTreeBuilder.cpp +++ b/engine/core/html/parser/HTMLTreeBuilder.cpp @@ -128,8 +128,7 @@ void HTMLTreeBuilder::constructTree(AtomicHTMLToken* token) } else if (type == HTMLToken::EndOfFile) { processEndOfFile(token); } else { - // We ignore Comments. 
- ASSERT(type == HTMLToken::Comment); + ASSERT_NOT_REACHED(); } m_tree.executeQueuedTasks(); diff --git a/tests/parser/comments-expected.txt b/tests/parser/comments-expected.txt new file mode 100644 index 000000000..8de7aeba2 --- /dev/null +++ b/tests/parser/comments-expected.txt @@ -0,0 +1 @@ +< --> -> > diff --git a/tests/parser/comments.html b/tests/parser/comments.html new file mode 100644 index 000000000..777314308 --- /dev/null +++ b/tests/parser/comments.html @@ -0,0 +1,15 @@ + + + +< + +aaa--> +--> +-> +> + + + -- GitLab