From 710a71719210ecdfca8c7bb185bcabef8cdeda49 Mon Sep 17 00:00:00 2001
From: Adam Barth <abarth@chromium.org>
Date: Mon, 27 Oct 2014 17:04:52 -0700
Subject: [PATCH] Parse comments according to parsing.md

We no longer generate comment tokens at all: the tokenizer now consumes
comments itself, which makes its comment-handling states much simpler.

R=eseidel@chromium.org

Review URL: https://codereview.chromium.org/682893002
---
 engine/core/html/parser/AtomicHTMLToken.h    |  10 +-
 engine/core/html/parser/CompactHTMLToken.cpp |   1 -
 engine/core/html/parser/HTMLToken.h          |  25 +--
 engine/core/html/parser/HTMLTokenizer.cpp    | 158 ++++---------------
 engine/core/html/parser/HTMLTokenizer.h      |  17 +-
 engine/core/html/parser/HTMLTreeBuilder.cpp  |   3 +-
 tests/parser/comments-expected.txt           |   1 +
 tests/parser/comments.html                   |  15 ++
 8 files changed, 54 insertions(+), 176 deletions(-)
 create mode 100644 tests/parser/comments-expected.txt
 create mode 100644 tests/parser/comments.html

diff --git a/engine/core/html/parser/AtomicHTMLToken.h b/engine/core/html/parser/AtomicHTMLToken.h
index c23c42d04..0ba618499 100644
--- a/engine/core/html/parser/AtomicHTMLToken.h
+++ b/engine/core/html/parser/AtomicHTMLToken.h
@@ -83,12 +83,6 @@ public:
         return m_data;
     }
 
-    const String& comment() const
-    {
-        ASSERT(m_type == HTMLToken::Comment);
-        return m_data;
-    }
-
     explicit AtomicHTMLToken(HTMLToken& token)
         : m_type(token.type())
     {
@@ -109,7 +103,6 @@ public:
             break;
         }
         case HTMLToken::Character:
-        case HTMLToken::Comment:
             if (token.isAll8BitData())
                 m_data = String::make8BitFrom16BitSource(token.data());
             else
@@ -141,7 +134,6 @@ public:
             m_name = AtomicString(token.data());
             break;
         case HTMLToken::Character:
-        case HTMLToken::Comment:
             m_data = token.data();
             break;
         }
@@ -175,7 +167,7 @@ private:
     // "name" for StartTag and EndTag
     AtomicString m_name;
 
-    // "data" for Comment, "characters" for Character
+    // "characters" for Character
     String m_data;
 
     // For StartTag and EndTag
diff --git a/engine/core/html/parser/CompactHTMLToken.cpp b/engine/core/html/parser/CompactHTMLToken.cpp
index 46b4a5740..b4ec1c19b 100644
--- a/engine/core/html/parser/CompactHTMLToken.cpp
+++ b/engine/core/html/parser/CompactHTMLToken.cpp
@@ -59,7 +59,6 @@ CompactHTMLToken::CompactHTMLToken(const HTMLToken* token, const TextPosition& t
     case HTMLToken::EndTag:
         m_selfClosing = token->selfClosing();
         // Fall through!
-    case HTMLToken::Comment:
     case HTMLToken::Character: {
         m_isAll8BitData = token->isAll8BitData();
         m_data = attemptStaticStringCreation(token->data(), token->isAll8BitData() ? Force8Bit : Force16Bit);
diff --git a/engine/core/html/parser/HTMLToken.h b/engine/core/html/parser/HTMLToken.h
index 293459342..d716906bb 100644
--- a/engine/core/html/parser/HTMLToken.h
+++ b/engine/core/html/parser/HTMLToken.h
@@ -50,7 +50,6 @@ public:
         Uninitialized,
         StartTag,
         EndTag,
-        Comment,
         Character,
         EndOfFile,
     };
@@ -114,7 +113,7 @@ public:
 
     const DataVector& data() const
     {
-        ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
+        ASSERT(m_type == Character || m_type == StartTag || m_type == EndTag);
         return m_data;
     }
 
@@ -298,28 +297,6 @@ public:
         m_data.appendVector(characters);
     }
 
-    /* Comment Tokens */
-
-    const DataVector& comment() const
-    {
-        ASSERT(m_type == Comment);
-        return m_data;
-    }
-
-    void beginComment()
-    {
-        ASSERT(m_type == Uninitialized);
-        m_type = Comment;
-    }
-
-    void appendToComment(UChar character)
-    {
-        ASSERT(character);
-        ASSERT(m_type == Comment);
-        m_data.append(character);
-        m_orAllData |= character;
-    }
-
 private:
     Type m_type;
     Attribute::Range m_range; // Always starts at zero.
diff --git a/engine/core/html/parser/HTMLTokenizer.cpp b/engine/core/html/parser/HTMLTokenizer.cpp
index ca454ad3c..6c4c21099 100644
--- a/engine/core/html/parser/HTMLTokenizer.cpp
+++ b/engine/core/html/parser/HTMLTokenizer.cpp
@@ -235,21 +235,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
 
     HTML_BEGIN_STATE(TagOpenState) {
         if (cc == '!')
-            HTML_ADVANCE_TO(MarkupDeclarationOpenState);
+            HTML_ADVANCE_TO(CommentStart1State);
         else if (cc == '/')
-            HTML_ADVANCE_TO(EndTagOpenState);
+            HTML_ADVANCE_TO(CloseTagState);
         else if (isASCIIUpper(cc)) {
             m_token->beginStartTag(toLowerCase(cc));
             HTML_ADVANCE_TO(TagNameState);
         } else if (isASCIILower(cc)) {
             m_token->beginStartTag(cc);
             HTML_ADVANCE_TO(TagNameState);
-        } else if (cc == '?') {
-            parseError();
-            // The spec consumes the current character before switching
-            // to the bogus comment state, but it's easier to implement
-            // if we reconsume the current character.
-            HTML_RECONSUME_IN(BogusCommentState);
         } else {
             parseError();
             bufferCharacter('<');
@@ -258,7 +252,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
     }
     END_STATE()
 
-    HTML_BEGIN_STATE(EndTagOpenState) {
+    HTML_BEGIN_STATE(CloseTagState) {
         if (isASCIIUpper(cc)) {
             m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
             m_appropriateEndTagName.clear();
@@ -268,16 +262,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
             m_appropriateEndTagName.clear();
             HTML_ADVANCE_TO(TagNameState);
         } else if (cc == '>') {
-            parseError();
+            bufferCharacter('<');
+            bufferCharacter('/');
+            bufferCharacter('>');
             HTML_ADVANCE_TO(DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
+        } else {
             bufferCharacter('<');
             bufferCharacter('/');
             HTML_RECONSUME_IN(DataState);
-        } else {
-            parseError();
-            HTML_RECONSUME_IN(BogusCommentState);
         }
     }
     END_STATE()
@@ -571,144 +563,54 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
     }
     END_STATE()
 
-    HTML_BEGIN_STATE(BogusCommentState) {
-        m_token->beginComment();
-        HTML_RECONSUME_IN(ContinueBogusCommentState);
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(ContinueBogusCommentState) {
-        if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == kEndOfFileMarker)
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        else {
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(ContinueBogusCommentState);
-        }
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
+    HTML_BEGIN_STATE(CommentStart1State) {
         if (cc == '-') {
-            SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
-            if (result == SegmentedString::DidMatch) {
-                source.advanceAndASSERT('-');
-                source.advanceAndASSERT('-');
-                m_token->beginComment();
-                HTML_SWITCH_TO(CommentStartState);
-            } else if (result == SegmentedString::NotEnoughCharacters)
-                return haveBufferedCharacterToken();
-        }
-        parseError();
-        HTML_RECONSUME_IN(BogusCommentState);
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(CommentStartState) {
-        if (cc == '-')
-            HTML_ADVANCE_TO(CommentStartDashState);
-        else if (cc == '>') {
-            parseError();
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
+            HTML_ADVANCE_TO(CommentStart2State);
         } else {
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(CommentState);
+            bufferCharacter('<');
+            bufferCharacter('!');
+            HTML_RECONSUME_IN(DataState);
         }
     }
     END_STATE()
 
-    HTML_BEGIN_STATE(CommentStartDashState) {
-        if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndState);
-        else if (cc == '>') {
-            parseError();
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
+    HTML_BEGIN_STATE(CommentStart2State) {
+        if (cc == '-') {
             HTML_ADVANCE_TO(CommentState);
+        } else {
+            bufferCharacter('<');
+            bufferCharacter('!');
+            bufferCharacter('-');
+            HTML_RECONSUME_IN(DataState);
         }
     }
     END_STATE()
 
     HTML_BEGIN_STATE(CommentState) {
         if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndDashState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment(cc);
+            HTML_ADVANCE_TO(CommentEnd1State);
+        else
             HTML_ADVANCE_TO(CommentState);
-        }
     }
     END_STATE()
 
-    HTML_BEGIN_STATE(CommentEndDashState) {
+    HTML_BEGIN_STATE(CommentEnd1State) {
         if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(CommentState);
-        }
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(CommentEndState) {
-        if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == '!') {
-            parseError();
-            HTML_ADVANCE_TO(CommentEndBangState);
-        } else if (cc == '-') {
-            parseError();
-            m_token->appendToComment('-');
-            HTML_ADVANCE_TO(CommentEndState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            parseError();
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
+            HTML_ADVANCE_TO(CommentEnd2State);
+        else
             HTML_ADVANCE_TO(CommentState);
-        }
     }
     END_STATE()
 
-    HTML_BEGIN_STATE(CommentEndBangState) {
-        if (cc == '-') {
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment('!');
-            HTML_ADVANCE_TO(CommentEndDashState);
-        } else if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment('!');
-            m_token->appendToComment(cc);
+    HTML_BEGIN_STATE(CommentEnd2State) {
+        if (cc == '-')
+            HTML_ADVANCE_TO(CommentEnd2State);
+        else if (cc == '>')
+            HTML_ADVANCE_TO(DataState);
+        else
             HTML_ADVANCE_TO(CommentState);
-        }
     }
     END_STATE()
-
     }
 
     ASSERT_NOT_REACHED();
diff --git a/engine/core/html/parser/HTMLTokenizer.h b/engine/core/html/parser/HTMLTokenizer.h
index a22cf50df..bc8c48af0 100644
--- a/engine/core/html/parser/HTMLTokenizer.h
+++ b/engine/core/html/parser/HTMLTokenizer.h
@@ -49,7 +49,7 @@ public:
         CharacterReferenceInAttributeValueState,
         RAWTEXTState,
         TagOpenState,
-        EndTagOpenState,
+        CloseTagState,
         TagNameState,
         RAWTEXTLessThanSignState,
         RAWTEXTEndTagOpenState,
@@ -63,18 +63,11 @@ public:
         AttributeValueUnquotedState,
         AfterAttributeValueQuotedState,
         SelfClosingStartTagState,
-        BogusCommentState,
-        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
-        // it internally to keep track of whether we've started the bogus
-        // comment token yet.
-        ContinueBogusCommentState,
-        MarkupDeclarationOpenState,
-        CommentStartState,
-        CommentStartDashState,
+        CommentStart1State,
+        CommentStart2State,
         CommentState,
-        CommentEndDashState,
-        CommentEndState,
-        CommentEndBangState,
+        CommentEnd1State,
+        CommentEnd2State,
     };
 
     // This function returns true if it emits a token. Otherwise, callers
diff --git a/engine/core/html/parser/HTMLTreeBuilder.cpp b/engine/core/html/parser/HTMLTreeBuilder.cpp
index 89fc3a854..710a99df3 100644
--- a/engine/core/html/parser/HTMLTreeBuilder.cpp
+++ b/engine/core/html/parser/HTMLTreeBuilder.cpp
@@ -128,8 +128,7 @@ void HTMLTreeBuilder::constructTree(AtomicHTMLToken* token)
     } else if (type == HTMLToken::EndOfFile) {
         processEndOfFile(token);
     } else {
-        // We ignore Comments.
-        ASSERT(type == HTMLToken::Comment);
+        ASSERT_NOT_REACHED();
     }
 
     m_tree.executeQueuedTasks();
diff --git a/tests/parser/comments-expected.txt b/tests/parser/comments-expected.txt
new file mode 100644
index 000000000..8de7aeba2
--- /dev/null
+++ b/tests/parser/comments-expected.txt
@@ -0,0 +1 @@
+< <! <!- <!-> --> -> > </ </>
diff --git a/tests/parser/comments.html b/tests/parser/comments.html
new file mode 100644
index 000000000..777314308
--- /dev/null
+++ b/tests/parser/comments.html
@@ -0,0 +1,15 @@
+<html>
+<link rel="import" href="../resources/dump-as-text.html" />
+<body>
+<
+<!
+<!-
+<!->
+<!-->aaa-->
+-->
+->
+>
+</
+</>
+</body>
+</html>
-- 
GitLab