diff --git a/engine/core/html/parser/HTMLEntityParser.cpp b/engine/core/html/parser/HTMLEntityParser.cpp
index dc30c83715f4f72ba2b92a598ad4d987b51370c8..6d90746bbe62d43be202489f887aa0f010c27cc7 100644
--- a/engine/core/html/parser/HTMLEntityParser.cpp
+++ b/engine/core/html/parser/HTMLEntityParser.cpp
@@ -1,305 +1,171 @@
-/*
- * Copyright (C) 2008 Apple Inc. All Rights Reserved.
- * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
- * Copyright (C) 2010 Google, Inc. All Rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
#include "config.h"
#include "core/html/parser/HTMLEntityParser.h"
-#include "core/html/parser/HTMLEntitySearch.h"
-#include "core/html/parser/HTMLEntityTable.h"
-#include "wtf/text/StringBuilder.h"
+#include "wtf/unicode/CharacterNames.h"
using namespace WTF;
namespace blink {
-static const UChar windowsLatin1ExtensionArray[32] = {
- 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
- 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
- 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
-};
-
-static bool isAlphaNumeric(UChar cc)
-{
- return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
-}
+static const UChar32 kInvalidUnicode = -1;
-static UChar adjustEntity(UChar32 value)
+static UChar asHexDigit(UChar cc) // Returns the numeric value (0-15) of the ASCII hex digit cc.
{
- if ((value & ~0x1F) != 0x0080)
- return value;
- return windowsLatin1ExtensionArray[value - 0x80];
+ if (cc >= '0' && cc <= '9')
+ return cc - '0';
+ if (cc >= 'a' && cc <= 'f')
+ return 10 + cc - 'a';
+ if (cc >= 'A' && cc <= 'F')
+ return 10 + cc - 'A';
+ ASSERT_NOT_REACHED(); // The visible caller only passes characters that satisfied isHexDigit().
+ return 0;
}
-static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
+static bool isAlphaNumeric(UChar cc) // True only for ASCII digits and ASCII letters.
{
- // FIXME: A number of specific entity values generate parse errors.
- if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
- decodedEntity.append(0xFFFD);
- return;
- }
- if (U_IS_BMP(c)) {
- decodedEntity.append(adjustEntity(c));
- return;
- }
- decodedEntity.append(c);
+ return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
}
-static const UChar32 kInvalidUnicode = -1;
-
static bool isHexDigit(UChar cc)
{
return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
}
-static UChar asHexDigit(UChar cc)
+static UChar decodeEntity(const HTMLEntityParser::OutputBuffer& buffer) // Maps buffered "&name" text (no trailing ';') to its character.
{
- if (cc >= '0' && cc <= '9')
- return cc - '0';
- if (cc >= 'a' && cc <= 'z')
- return 10 + cc - 'a';
- if (cc >= 'A' && cc <= 'Z')
- return 10 + cc - 'A';
- ASSERT_NOT_REACHED();
- return 0;
+ if (equalIgnoringNullity(buffer, "&amp")) // The buffer starts with '&' and never contains the ';' (parse() does not buffer it).
+ return '&';
+ if (equalIgnoringNullity(buffer, "&apos"))
+ return '\'';
+ if (equalIgnoringNullity(buffer, "&gt"))
+ return '>';
+ if (equalIgnoringNullity(buffer, "&lt"))
+ return '<';
+ if (equalIgnoringNullity(buffer, "&quot"))
+ return '"';
+ return replacementCharacter; // Any other name is not recognized by this parser.
}
-typedef Vector ConsumedCharacterBuffer;
-
-static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
+HTMLEntityParser::HTMLEntityParser() : m_state(Initial), m_result(0) // Give the scalar members defined values even if parse() runs before reset().
{
- if (consumedCharacters.size() == 1)
- source.push(consumedCharacters[0]);
- else if (consumedCharacters.size() == 2) {
- source.push(consumedCharacters[0]);
- source.push(consumedCharacters[1]);
- } else
- source.prepend(SegmentedString(String(consumedCharacters)));
}
-static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
+HTMLEntityParser::~HTMLEntityParser() // Empty body: no explicit cleanup is performed here.
{
- ConsumedCharacterBuffer consumedCharacters;
- HTMLEntitySearch entitySearch;
- while (!source.isEmpty()) {
- cc = source.currentChar();
- entitySearch.advance(cc);
- if (!entitySearch.isEntityPrefix())
- break;
- consumedCharacters.append(cc);
- source.advanceAndASSERT(cc);
- }
- notEnoughCharacters = source.isEmpty();
- if (notEnoughCharacters) {
- // We can't decide on an entity because there might be a longer entity
- // that we could match if we had more data.
- unconsumeCharacters(source, consumedCharacters);
- return false;
- }
- if (!entitySearch.mostRecentMatch()) {
- unconsumeCharacters(source, consumedCharacters);
- return false;
- }
- if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
- // We've consumed too many characters. We need to walk the
- // source back to the point at which we had consumed an
- // actual entity.
- unconsumeCharacters(source, consumedCharacters);
- consumedCharacters.clear();
- const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
- const int length = mostRecent->length;
- const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
- for (int i = 0; i < length; ++i) {
- cc = source.currentChar();
- ASSERT_UNUSED(reference, cc == static_cast(*reference++));
- consumedCharacters.append(cc);
- source.advanceAndASSERT(cc);
- ASSERT(!source.isEmpty());
- }
- cc = source.currentChar();
- }
- if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
- || !additionalAllowedCharacter
- || !(isAlphaNumeric(cc) || cc == '=')) {
- decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
- if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
- decodedEntity.append(second);
- return true;
- }
- unconsumeCharacters(source, consumedCharacters);
- return false;
}
-bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
+void HTMLEntityParser::reset() // Puts the parser back in its starting state for a new character reference.
{
- ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
- ASSERT(!notEnoughCharacters);
- ASSERT(decodedEntity.isEmpty());
-
- enum EntityState {
- Initial,
- Number,
- MaybeHexLowerCaseX,
- MaybeHexUpperCaseX,
- Hex,
- Decimal,
- Named
- };
- EntityState entityState = Initial;
- UChar32 result = 0;
- ConsumedCharacterBuffer consumedCharacters;
+ m_state = Initial;
+ m_result = '\0'; // The numeric accumulator starts at zero.
+ m_buffer.clear();
+ m_buffer.append('&'); // Seed with '&' so result() holds the literal text when nothing gets decoded.
+}
+bool HTMLEntityParser::parse(SegmentedString& source) // Returns true once parsing has finished and result() is ready; false if source ran out first (state is kept in members).
+{
while (!source.isEmpty()) {
UChar cc = source.currentChar();
- switch (entityState) {
+ switch (m_state) {
case Initial: {
- if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
- return false;
- if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
- return false;
if (cc == '#') {
- entityState = Number;
+ m_state = Numeric;
break;
}
- if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
- entityState = Named;
+ if (isAlphaNumeric(cc)) {
+ m_state = Named; // The 'continue' below re-dispatches cc in the new state without consuming it.
continue;
}
- return false;
+ return true; // Not the start of a reference: cc stays in source and result() is the text buffered so far.
}
- case Number: {
- if (cc == 'x') {
- entityState = MaybeHexLowerCaseX;
- break;
- }
- if (cc == 'X') {
- entityState = MaybeHexUpperCaseX;
+ case Numeric: {
+ if (cc == 'x' || cc == 'X') {
+ m_state = PossiblyHex;
break;
}
if (cc >= '0' && cc <= '9') {
- entityState = Decimal;
+ m_state = Decimal;
continue;
}
- source.push('#');
- return false;
+ return true; // "&#" without a digit: nothing is decoded.
}
- case MaybeHexLowerCaseX: {
+ case PossiblyHex: {
if (isHexDigit(cc)) {
- entityState = Hex;
+ m_state = Hex;
continue;
}
- source.push('#');
- source.push('x');
- return false;
- }
- case MaybeHexUpperCaseX: {
- if (isHexDigit(cc)) {
- entityState = Hex;
- continue;
- }
- source.push('#');
- source.push('X');
- return false;
+ return true; // "&#x" without a hex digit: nothing is decoded.
}
case Hex: {
if (isHexDigit(cc)) {
- if (result != kInvalidUnicode)
- result = result * 16 + asHexDigit(cc);
- } else if (cc == ';') {
+ if (m_result != kInvalidUnicode)
+ m_result = m_result * 16 + asHexDigit(cc);
+ break; // Leave the switch so cc is buffered and consumed below.
+ }
+ if (cc == ';') {
source.advanceAndASSERT(cc);
- appendLegalEntityFor(result, decodedEntity);
- return true;
- } else {
- appendLegalEntityFor(result, decodedEntity);
+ finalizeNumericEntity();
return true;
}
- break;
+ return true; // No terminating ';': finalizeNumericEntity() is not called, so result() is the raw buffered text.
}
case Decimal: {
if (cc >= '0' && cc <= '9') {
- if (result != kInvalidUnicode)
- result = result * 10 + cc - '0';
- } else if (cc == ';') {
+ if (m_result != kInvalidUnicode)
+ m_result = m_result * 10 + cc - '0';
+ break; // Leave the switch so cc is buffered and consumed below.
+ }
+ if (cc == ';') {
source.advanceAndASSERT(cc);
- appendLegalEntityFor(result, decodedEntity);
- return true;
- } else {
- appendLegalEntityFor(result, decodedEntity);
+ finalizeNumericEntity();
return true;
}
- break;
+ return true; // No terminating ';': result() is the raw buffered text.
}
case Named: {
- return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
+ if (isAlphaNumeric(cc))
+ break;
+ if (cc == ';') {
+ source.advanceAndASSERT(cc); // The ';' is consumed but not added to m_buffer.
+ finalizeNamedEntity();
+ return true;
+ }
+ return true; // No terminating ';': result() is the raw buffered text.
}
}
- if (result > UCHAR_MAX_VALUE)
- result = kInvalidUnicode;
+ if (m_result > UCHAR_MAX_VALUE) // Switch to the sentinel so the digit branches above stop accumulating.
+ m_result = kInvalidUnicode;
- consumedCharacters.append(cc);
+ m_buffer.append(cc); // Keep the raw character; it is what result() exposes unless an entity is finalized.
source.advanceAndASSERT(cc);
}
ASSERT(source.isEmpty());
- notEnoughCharacters = true;
- unconsumeCharacters(source, consumedCharacters);
return false;
}
-static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
+void HTMLEntityParser::finalizeNumericEntity() // Replaces the buffered raw text with the UTF-16 code unit(s) for m_result.
{
- if (U_IS_BMP(value)) {
- UChar character = static_cast(value);
- ASSERT(character == value);
- result[0] = character;
- return 1;
+ m_buffer.clear();
+ if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) { // Zero, negative (including kInvalidUnicode), above 0x10FFFF, or in the surrogate range.
+ m_buffer.append(replacementCharacter);
+ } else if (U_IS_BMP(m_result)) {
+ m_buffer.append(m_result);
+ } else {
+ m_buffer.append(U16_LEAD(m_result)); // Outside the BMP: emit lead and trail surrogates.
+ m_buffer.append(U16_TRAIL(m_result));
}
-
- result[0] = U16_LEAD(value);
- result[1] = U16_TRAIL(value);
- return 2;
}
-size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
+void HTMLEntityParser::finalizeNamedEntity() // Replaces the buffered "&name" text with the single character decodeEntity() returns for it.
{
- HTMLEntitySearch search;
- while (*name) {
- search.advance(*name++);
- if (!search.isEntityPrefix())
- return 0;
- }
- search.advance(';');
- if (!search.isEntityPrefix())
- return 0;
-
- size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
- if (!search.mostRecentMatch()->secondValue)
- return numberOfCodePoints;
- return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
+ UChar decodedEntity = decodeEntity(m_buffer); // Decode before clearing: decodeEntity reads the buffered name.
+ m_buffer.clear();
+ m_buffer.append(decodedEntity);
}
} // namespace blink
diff --git a/engine/core/html/parser/HTMLEntityParser.h b/engine/core/html/parser/HTMLEntityParser.h
index 42df842aa564d2d9afc6bc43552e77dc6aa7b30d..e146c4ec6ff1e60f94cedc844a5db865f90df61e 100644
--- a/engine/core/html/parser/HTMLEntityParser.h
+++ b/engine/core/html/parser/HTMLEntityParser.h
@@ -31,40 +31,35 @@
namespace blink {
-class DecodedHTMLEntity {
-private:
- // HTML entities contain at most four UTF-16 code units.
- static const unsigned kMaxLength = 4;
-
+class HTMLEntityParser { // Resumable character-reference parser: reset(), then parse() until it returns true, then read result().
public:
- DecodedHTMLEntity() : length(0) { }
+ typedef Vector OutputBuffer; // NOTE(review): template arguments look stripped from this listing; elements are used as UChar - confirm the original declaration.
- bool isEmpty() const { return !length; }
+ HTMLEntityParser();
+ ~HTMLEntityParser();
- void append(UChar c)
- {
- RELEASE_ASSERT(length < kMaxLength);
- data[length++] = c;
- }
+ void reset(); // Starts a new reference; the buffer is re-seeded with '&'.
+ bool parse(SegmentedString&); // False means the input ran out; call again with more input.
- void append(UChar32 c)
- {
- if (U_IS_BMP(c)) {
- append(static_cast(c));
- return;
- }
- append(U16_LEAD(c));
- append(U16_TRAIL(c));
- }
+ const OutputBuffer& result() const { return m_buffer; } // Decoded character(s), or the raw text when nothing was decoded.
- unsigned length;
- UChar data[kMaxLength];
-};
+private:
+ enum EntityState {
+ Initial, // Nothing seen after '&' yet.
+ Numeric, // Seen "&#".
+ PossiblyHex, // Seen "&#x" or "&#X", no digit yet.
+ Hex, // Reading hex digits.
+ Decimal, // Reading decimal digits.
+ Named // Reading an alphanumeric name.
+ };
-bool consumeHTMLEntity(SegmentedString&, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0');
+ void finalizeNumericEntity();
+ void finalizeNamedEntity();
-// Used by the XML parser. Not suitable for use in HTML parsing. Use consumeHTMLEntity instead.
-size_t decodeNamedEntityToUCharArray(const char*, UChar result[4]);
+ EntityState m_state;
+ UChar32 m_result; // Value accumulated for a numeric reference.
+ OutputBuffer m_buffer; // Raw text consumed so far, replaced by the decoded output on finalize.
+};
}
diff --git a/engine/core/html/parser/HTMLTokenizer.cpp b/engine/core/html/parser/HTMLTokenizer.cpp
index 7e765b277ea71084fe31a34ad8f7308930a44c34..ca454ad3cd314b958735a106c6b87d0ed7cb5b85 100644
--- a/engine/core/html/parser/HTMLTokenizer.cpp
+++ b/engine/core/html/parser/HTMLTokenizer.cpp
@@ -111,24 +111,6 @@ void HTMLTokenizer::reset()
{
m_state = HTMLTokenizer::DataState;
m_token = 0;
- m_additionalAllowedCharacter = '\0';
-}
-
-inline bool HTMLTokenizer::processEntity(SegmentedString& source)
-{
- bool notEnoughCharacters = false;
- DecodedHTMLEntity decodedEntity;
- bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
- if (notEnoughCharacters)
- return false;
- if (!success) {
- ASSERT(decodedEntity.isEmpty());
- bufferCharacter('&');
- } else {
- for (unsigned i = 0; i < decodedEntity.length; ++i)
- bufferCharacter(decodedEntity.data[i]);
- }
- return true;
}
bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
@@ -146,7 +128,7 @@ bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
#define FLUSH_AND_ADVANCE_TO(stateName) \
do { \
- m_state = HTMLTokenizer::stateName; \
+ m_state = HTMLTokenizer::stateName; \
if (flushBufferedEndTag(source)) \
return true; \
if (source.isEmpty() \
@@ -190,9 +172,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
// Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
switch (m_state) {
HTML_BEGIN_STATE(DataState) {
- if (cc == '&')
+ if (cc == '&') {
+ m_returnState = DataState;
+ m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInDataState);
- else if (cc == '<') {
+ } else if (cc == '<') {
if (m_token->type() == HTMLToken::Character) {
// We have a bunch of character tokens queued up that we
// are emitting lazily here.
@@ -209,12 +193,34 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
END_STATE()
HTML_BEGIN_STATE(CharacterReferenceInDataState) {
- if (!processEntity(source))
+ if (!m_entityParser.parse(source)) // False: input ran out mid-reference; stay in this state and resume later.
return haveBufferedCharacterToken();
+ for (const UChar& entityCharacter : m_entityParser.result())
+ bufferCharacter(entityCharacter);
+ cc = m_inputStreamPreprocessor.nextInputCharacter(); // NOTE(review): presumably refreshes cc after parse() advanced source - confirm.
+ ASSERT(m_returnState == DataState); // DataState is the only visible state that enters here, and it sets m_returnState first.
HTML_SWITCH_TO(DataState);
}
END_STATE()
+ HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
+ if (!m_entityParser.parse(source)) // False: input ran out mid-reference; stay in this state and resume later.
+ return haveBufferedCharacterToken();
+ for (const UChar& entityCharacter : m_entityParser.result())
+ m_token->appendToAttributeValue(entityCharacter);
+ cc = m_inputStreamPreprocessor.nextInputCharacter(); // NOTE(review): presumably refreshes cc after parse() advanced source - confirm.
+
+ if (m_returnState == AttributeValueDoubleQuotedState) // Return to whichever attribute-value state recorded itself in m_returnState.
+ HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
+ else if (m_returnState == AttributeValueSingleQuotedState)
+ HTML_SWITCH_TO(AttributeValueSingleQuotedState);
+ else if (m_returnState == AttributeValueUnquotedState)
+ HTML_SWITCH_TO(AttributeValueUnquotedState);
+ else
+ ASSERT_NOT_REACHED(); // Only the three attribute-value states above set m_returnState before entering here.
+ }
+ END_STATE()
+
HTML_BEGIN_STATE(RAWTEXTState) {
if (cc == '<')
HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
@@ -477,7 +483,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
} else if (cc == '&') {
- m_additionalAllowedCharacter = '"';
+ m_returnState = AttributeValueDoubleQuotedState;
+ m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == kEndOfFileMarker) {
parseError();
@@ -495,7 +502,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
} else if (cc == '&') {
- m_additionalAllowedCharacter = '\'';
+ m_returnState = AttributeValueSingleQuotedState;
+ m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == kEndOfFileMarker) {
parseError();
@@ -513,7 +521,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(BeforeAttributeNameState);
} else if (cc == '&') {
- m_additionalAllowedCharacter = '>';
+ m_returnState = AttributeValueUnquotedState;
+ m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == '>') {
m_token->endAttributeValue(source.numberOfCharactersConsumed());
@@ -531,34 +540,6 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
- HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
- bool notEnoughCharacters = false;
- DecodedHTMLEntity decodedEntity;
- bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
- if (notEnoughCharacters)
- return haveBufferedCharacterToken();
- if (!success) {
- ASSERT(decodedEntity.isEmpty());
- m_token->appendToAttributeValue('&');
- } else {
- for (unsigned i = 0; i < decodedEntity.length; ++i)
- m_token->appendToAttributeValue(decodedEntity.data[i]);
- }
- // We're supposed to switch back to the attribute value state that
- // we were in when we were switched into this state. Rather than
- // keeping track of this explictly, we observe that the previous
- // state can be determined by m_additionalAllowedCharacter.
- if (m_additionalAllowedCharacter == '"')
- HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
- else if (m_additionalAllowedCharacter == '\'')
- HTML_SWITCH_TO(AttributeValueSingleQuotedState);
- else if (m_additionalAllowedCharacter == '>')
- HTML_SWITCH_TO(AttributeValueUnquotedState);
- else
- ASSERT_NOT_REACHED();
- }
- END_STATE()
-
HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
if (isTokenizerWhitespace(cc))
HTML_ADVANCE_TO(BeforeAttributeNameState);
diff --git a/engine/core/html/parser/HTMLTokenizer.h b/engine/core/html/parser/HTMLTokenizer.h
index 302b6482e1c2b58500599708fd350a23f3fb2b14..a22cf50df65ecf530b9d1f0b6ea2b889945ebcdc 100644
--- a/engine/core/html/parser/HTMLTokenizer.h
+++ b/engine/core/html/parser/HTMLTokenizer.h
@@ -27,6 +27,7 @@
#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h
+#include "core/html/parser/HTMLEntityParser.h"
#include "core/html/parser/HTMLToken.h"
#include "core/html/parser/InputStreamPreprocessor.h"
#include "platform/text/SegmentedString.h"
@@ -45,6 +46,7 @@ public:
enum State {
DataState,
CharacterReferenceInDataState,
+ CharacterReferenceInAttributeValueState,
RAWTEXTState,
TagOpenState,
EndTagOpenState,
@@ -59,7 +61,6 @@ public:
AttributeValueDoubleQuotedState,
AttributeValueSingleQuotedState,
AttributeValueUnquotedState,
- CharacterReferenceInAttributeValueState,
AfterAttributeValueQuotedState,
SelfClosingStartTagState,
BogusCommentState,
@@ -87,8 +88,6 @@ public:
private:
HTMLTokenizer();
- inline bool processEntity(SegmentedString&);
-
inline void parseError();
inline void bufferCharacter(UChar character)
@@ -156,11 +155,11 @@ private:
// this member might be pointing to unallocated memory.
HTMLToken* m_token;
- // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
- UChar m_additionalAllowedCharacter;
+ State m_returnState;
// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
InputStreamPreprocessor m_inputStreamPreprocessor;
+ HTMLEntityParser m_entityParser;
Vector m_appropriateEndTagName;
diff --git a/tests/parser/entity-expected.txt b/tests/parser/entity-expected.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b13a39b09cc8599df8184b7bf421fb47e9306452
--- /dev/null
+++ b/tests/parser/entity-expected.txt
@@ -0,0 +1,19 @@
+
+
+
+aaa'bbb
+<
+aaa�bbb
+A
+&
+&#
+&#x
+A
+&;
+&#;
+&#x;
+A
+�
+
+
+
diff --git a/tests/parser/entity.html b/tests/parser/entity.html
new file mode 100644
index 0000000000000000000000000000000000000000..0abc6a965c9d87f1c4a7d12f9bddb084d545519e
--- /dev/null
+++ b/tests/parser/entity.html
@@ -0,0 +1,18 @@
+
+
+
+aaa'bbb
+<
+aaa&xxx;bbb
+A
+&
+
+
+A
+&;
+
+
+A
+
+
+
diff --git a/tests/resources/dump-as-markup.html b/tests/resources/dump-as-markup.html
new file mode 100644
index 0000000000000000000000000000000000000000..81e849faf859a051d12a3109ec9bd336fa9924cc
--- /dev/null
+++ b/tests/resources/dump-as-markup.html
@@ -0,0 +1,5 @@
+