提交 710a7171 编写于 作者: A Adam Barth

Parse comments according to parsing.md

The tokenizer no longer generates comment tokens at all: comments are consumed
and discarded, which makes the comment-handling states much simpler.

R=eseidel@chromium.org

Review URL: https://codereview.chromium.org/682893002
上级 36288fcd
......@@ -83,12 +83,6 @@ public:
return m_data;
}
// Accessor for the text carried by a comment token.
// Asserts (via ASSERT) that this token's type is HTMLToken::Comment and
// returns a const reference to m_data.
const String& comment() const
{
ASSERT(m_type == HTMLToken::Comment);
return m_data;
}
explicit AtomicHTMLToken(HTMLToken& token)
: m_type(token.type())
{
......@@ -109,7 +103,6 @@ public:
break;
}
case HTMLToken::Character:
case HTMLToken::Comment:
if (token.isAll8BitData())
m_data = String::make8BitFrom16BitSource(token.data());
else
......@@ -141,7 +134,6 @@ public:
m_name = AtomicString(token.data());
break;
case HTMLToken::Character:
case HTMLToken::Comment:
m_data = token.data();
break;
}
......@@ -175,7 +167,7 @@ private:
// "name" for StartTag and EndTag
AtomicString m_name;
// "data" for Comment, "characters" for Character
// "characters" for Character
String m_data;
// For StartTag and EndTag
......
......@@ -59,7 +59,6 @@ CompactHTMLToken::CompactHTMLToken(const HTMLToken* token, const TextPosition& t
case HTMLToken::EndTag:
m_selfClosing = token->selfClosing();
// Fall through!
case HTMLToken::Comment:
case HTMLToken::Character: {
m_isAll8BitData = token->isAll8BitData();
m_data = attemptStaticStringCreation(token->data(), token->isAll8BitData() ? Force8Bit : Force16Bit);
......
......@@ -50,7 +50,6 @@ public:
Uninitialized,
StartTag,
EndTag,
Comment,
Character,
EndOfFile,
};
......@@ -114,7 +113,7 @@ public:
// Returns a const reference to the token's data buffer (m_data).
// The assertions limit callers to the token types listed in them.
// NOTE(review): two ASSERTs appear back to back; the first also accepts
// Comment while the second does not, so together they reject Comment.
// They look like the before/after lines of a diff hunk shown without
// +/- markers -- confirm against the real header which one is live.
const DataVector& data() const
{
ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
ASSERT(m_type == Character || m_type == StartTag || m_type == EndTag);
return m_data;
}
......@@ -298,28 +297,6 @@ public:
m_data.appendVector(characters);
}
/* Comment Tokens */
// Returns a const reference to the comment's characters (m_data).
// Asserts that the token's type is Comment.
const DataVector& comment() const
{
ASSERT(m_type == Comment);
return m_data;
}
// Turns this token into a comment token.
// Asserts that the token is still Uninitialized, then sets m_type to
// Comment. No data is appended here.
void beginComment()
{
ASSERT(m_type == Uninitialized);
m_type = Comment;
}
// Appends a single character to the comment's data.
// Asserts that `character` is non-zero and that the token's type is Comment.
void appendToComment(UChar character)
{
ASSERT(character);
ASSERT(m_type == Comment);
m_data.append(character);
// Accumulate the bitwise OR of every appended character.
// NOTE(review): presumably this feeds the isAll8BitData() check -- confirm.
m_orAllData |= character;
}
private:
Type m_type;
Attribute::Range m_range; // Always starts at zero.
......
......@@ -235,21 +235,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
HTML_BEGIN_STATE(TagOpenState) {
if (cc == '!')
HTML_ADVANCE_TO(MarkupDeclarationOpenState);
HTML_ADVANCE_TO(CommentStart1State);
else if (cc == '/')
HTML_ADVANCE_TO(EndTagOpenState);
HTML_ADVANCE_TO(CloseTagState);
else if (isASCIIUpper(cc)) {
m_token->beginStartTag(toLowerCase(cc));
HTML_ADVANCE_TO(TagNameState);
} else if (isASCIILower(cc)) {
m_token->beginStartTag(cc);
HTML_ADVANCE_TO(TagNameState);
} else if (cc == '?') {
parseError();
// The spec consumes the current character before switching
// to the bogus comment state, but it's easier to implement
// if we reconsume the current character.
HTML_RECONSUME_IN(BogusCommentState);
} else {
parseError();
bufferCharacter('<');
......@@ -258,7 +252,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
HTML_BEGIN_STATE(EndTagOpenState) {
HTML_BEGIN_STATE(CloseTagState) {
if (isASCIIUpper(cc)) {
m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
m_appropriateEndTagName.clear();
......@@ -268,16 +262,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_appropriateEndTagName.clear();
HTML_ADVANCE_TO(TagNameState);
} else if (cc == '>') {
parseError();
bufferCharacter('<');
bufferCharacter('/');
bufferCharacter('>');
HTML_ADVANCE_TO(DataState);
} else if (cc == kEndOfFileMarker) {
parseError();
} else {
bufferCharacter('<');
bufferCharacter('/');
HTML_RECONSUME_IN(DataState);
} else {
parseError();
HTML_RECONSUME_IN(BogusCommentState);
}
}
END_STATE()
......@@ -571,144 +563,54 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
HTML_BEGIN_STATE(BogusCommentState) {
m_token->beginComment();
HTML_RECONSUME_IN(ContinueBogusCommentState);
}
END_STATE()
HTML_BEGIN_STATE(ContinueBogusCommentState) {
if (cc == '>')
return emitAndResumeIn(source, HTMLTokenizer::DataState);
else if (cc == kEndOfFileMarker)
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
else {
m_token->appendToComment(cc);
HTML_ADVANCE_TO(ContinueBogusCommentState);
}
}
END_STATE()
HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
HTML_BEGIN_STATE(CommentStart1State) {
if (cc == '-') {
SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
if (result == SegmentedString::DidMatch) {
source.advanceAndASSERT('-');
source.advanceAndASSERT('-');
m_token->beginComment();
HTML_SWITCH_TO(CommentStartState);
} else if (result == SegmentedString::NotEnoughCharacters)
return haveBufferedCharacterToken();
}
parseError();
HTML_RECONSUME_IN(BogusCommentState);
}
END_STATE()
HTML_BEGIN_STATE(CommentStartState) {
if (cc == '-')
HTML_ADVANCE_TO(CommentStartDashState);
else if (cc == '>') {
parseError();
return emitAndResumeIn(source, HTMLTokenizer::DataState);
} else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
HTML_ADVANCE_TO(CommentStart2State);
} else {
m_token->appendToComment(cc);
HTML_ADVANCE_TO(CommentState);
bufferCharacter('<');
bufferCharacter('!');
HTML_RECONSUME_IN(DataState);
}
}
END_STATE()
HTML_BEGIN_STATE(CommentStartDashState) {
if (cc == '-')
HTML_ADVANCE_TO(CommentEndState);
else if (cc == '>') {
parseError();
return emitAndResumeIn(source, HTMLTokenizer::DataState);
} else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
} else {
m_token->appendToComment('-');
m_token->appendToComment(cc);
HTML_BEGIN_STATE(CommentStart2State) {
if (cc == '-') {
HTML_ADVANCE_TO(CommentState);
} else {
bufferCharacter('<');
bufferCharacter('!');
bufferCharacter('-');
HTML_RECONSUME_IN(DataState);
}
}
END_STATE()
HTML_BEGIN_STATE(CommentState) {
if (cc == '-')
HTML_ADVANCE_TO(CommentEndDashState);
else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
} else {
m_token->appendToComment(cc);
HTML_ADVANCE_TO(CommentEnd1State);
else
HTML_ADVANCE_TO(CommentState);
}
}
END_STATE()
HTML_BEGIN_STATE(CommentEndDashState) {
HTML_BEGIN_STATE(CommentEnd1State) {
if (cc == '-')
HTML_ADVANCE_TO(CommentEndState);
else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
} else {
m_token->appendToComment('-');
m_token->appendToComment(cc);
HTML_ADVANCE_TO(CommentState);
}
}
END_STATE()
HTML_BEGIN_STATE(CommentEndState) {
if (cc == '>')
return emitAndResumeIn(source, HTMLTokenizer::DataState);
else if (cc == '!') {
parseError();
HTML_ADVANCE_TO(CommentEndBangState);
} else if (cc == '-') {
parseError();
m_token->appendToComment('-');
HTML_ADVANCE_TO(CommentEndState);
} else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
} else {
parseError();
m_token->appendToComment('-');
m_token->appendToComment('-');
m_token->appendToComment(cc);
HTML_ADVANCE_TO(CommentEnd2State);
else
HTML_ADVANCE_TO(CommentState);
}
}
END_STATE()
HTML_BEGIN_STATE(CommentEndBangState) {
if (cc == '-') {
m_token->appendToComment('-');
m_token->appendToComment('-');
m_token->appendToComment('!');
HTML_ADVANCE_TO(CommentEndDashState);
} else if (cc == '>')
return emitAndResumeIn(source, HTMLTokenizer::DataState);
else if (cc == kEndOfFileMarker) {
parseError();
return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
} else {
m_token->appendToComment('-');
m_token->appendToComment('-');
m_token->appendToComment('!');
m_token->appendToComment(cc);
HTML_BEGIN_STATE(CommentEnd2State) {
if (cc == '-')
HTML_ADVANCE_TO(CommentEnd2State);
else if (cc == '>')
HTML_ADVANCE_TO(DataState);
else
HTML_ADVANCE_TO(CommentState);
}
}
END_STATE()
}
ASSERT_NOT_REACHED();
......
......@@ -49,7 +49,7 @@ public:
CharacterReferenceInAttributeValueState,
RAWTEXTState,
TagOpenState,
EndTagOpenState,
CloseTagState,
TagNameState,
RAWTEXTLessThanSignState,
RAWTEXTEndTagOpenState,
......@@ -63,18 +63,11 @@ public:
AttributeValueUnquotedState,
AfterAttributeValueQuotedState,
SelfClosingStartTagState,
BogusCommentState,
// The ContinueBogusCommentState is not in the HTML5 spec, but we use
// it internally to keep track of whether we've started the bogus
// comment token yet.
ContinueBogusCommentState,
MarkupDeclarationOpenState,
CommentStartState,
CommentStartDashState,
CommentStart1State,
CommentStart2State,
CommentState,
CommentEndDashState,
CommentEndState,
CommentEndBangState,
CommentEnd1State,
CommentEnd2State,
};
// This function returns true if it emits a token. Otherwise, callers
......
......@@ -128,8 +128,7 @@ void HTMLTreeBuilder::constructTree(AtomicHTMLToken* token)
} else if (type == HTMLToken::EndOfFile) {
processEndOfFile(token);
} else {
// We ignore Comments.
ASSERT(type == HTMLToken::Comment);
ASSERT_NOT_REACHED();
}
m_tree.executeQueuedTasks();
......
< <! <!- <!-> --> -> > </ </>
<html>
<link rel="import" href="../resources/dump-as-text.html" />
<body>
<
<!
<!-
<!->
<!-->aaa-->
-->
->
>
</
</>
</body>
</html>
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册