提交 5d98345d 编写于 作者: A Adam Barth

Parse Sky entities according to the spec

R=eseidel@chromium.org

Review URL: https://codereview.chromium.org/678073002
上级 1bceb35c
/*
* Copyright (C) 2008 Apple Inc. All Rights Reserved.
* Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
* Copyright (C) 2010 Google, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "config.h"
#include "core/html/parser/HTMLEntityParser.h"
#include "core/html/parser/HTMLEntitySearch.h"
#include "core/html/parser/HTMLEntityTable.h"
#include "wtf/text/StringBuilder.h"
#include "wtf/unicode/CharacterNames.h"
using namespace WTF;
namespace blink {
// Substitution table for the 32 code points 0x80 through 0x9F, indexed by
// (code point - 0x80). adjustEntity() returns the entry stored here for any
// value in that range and leaves every other value untouched. The trailing
// comment on each row names the input range that row covers.
static const UChar windowsLatin1ExtensionArray[32] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};
// Returns true when |cc| is an ASCII digit or an ASCII letter of either case.
static bool isAlphaNumeric(UChar cc)
{
    const bool isDecimalDigit = cc >= '0' && cc <= '9';
    const bool isLowerCaseLetter = cc >= 'a' && cc <= 'z';
    const bool isUpperCaseLetter = cc >= 'A' && cc <= 'Z';
    return isDecimalDigit || isLowerCaseLetter || isUpperCaseLetter;
}
static const UChar32 kInvalidUnicode = -1;
static UChar adjustEntity(UChar32 value)
static UChar asHexDigit(UChar cc)
{
if ((value & ~0x1F) != 0x0080)
return value;
return windowsLatin1ExtensionArray[value - 0x80];
if (cc >= '0' && cc <= '9')
return cc - '0';
if (cc >= 'a' && cc <= 'f')
return 10 + cc - 'a';
if (cc >= 'A' && cc <= 'F')
return 10 + cc - 'A';
ASSERT_NOT_REACHED();
return 0;
}
static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
static bool isAlphaNumeric(UChar cc)
{
// FIXME: A number of specific entity values generate parse errors.
if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
decodedEntity.append(0xFFFD);
return;
}
if (U_IS_BMP(c)) {
decodedEntity.append(adjustEntity(c));
return;
}
decodedEntity.append(c);
return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
}
static const UChar32 kInvalidUnicode = -1;
// Returns true when |cc| is one of the ASCII characters 0-9, a-f or A-F.
static bool isHexDigit(UChar cc)
{
    if (cc >= '0' && cc <= '9')
        return true;
    if (cc >= 'a' && cc <= 'f')
        return true;
    return cc >= 'A' && cc <= 'F';
}
static UChar asHexDigit(UChar cc)
static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)
{
if (cc >= '0' && cc <= '9')
return cc - '0';
if (cc >= 'a' && cc <= 'z')
return 10 + cc - 'a';
if (cc >= 'A' && cc <= 'Z')
return 10 + cc - 'A';
ASSERT_NOT_REACHED();
return 0;
if (equalIgnoringNullity(buffer, "&amp"))
return '&';
if (equalIgnoringNullity(buffer, "&apos"))
return '\'';
if (equalIgnoringNullity(buffer, "&gt"))
return '>';
if (equalIgnoringNullity(buffer, "&lt"))
return '<';
if (equalIgnoringNullity(buffer, "&quot"))
return '"';
return replacementCharacter;
}
typedef Vector<UChar, 64> ConsumedCharacterBuffer;
static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
HTMLEntityParser::HTMLEntityParser()
{
if (consumedCharacters.size() == 1)
source.push(consumedCharacters[0]);
else if (consumedCharacters.size() == 2) {
source.push(consumedCharacters[0]);
source.push(consumedCharacters[1]);
} else
source.prepend(SegmentedString(String(consumedCharacters)));
}
static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
HTMLEntityParser::~HTMLEntityParser()
{
ConsumedCharacterBuffer consumedCharacters;
HTMLEntitySearch entitySearch;
while (!source.isEmpty()) {
cc = source.currentChar();
entitySearch.advance(cc);
if (!entitySearch.isEntityPrefix())
break;
consumedCharacters.append(cc);
source.advanceAndASSERT(cc);
}
notEnoughCharacters = source.isEmpty();
if (notEnoughCharacters) {
// We can't decide on an entity because there might be a longer entity
// that we could match if we had more data.
unconsumeCharacters(source, consumedCharacters);
return false;
}
if (!entitySearch.mostRecentMatch()) {
unconsumeCharacters(source, consumedCharacters);
return false;
}
if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
// We've consumed too many characters. We need to walk the
// source back to the point at which we had consumed an
// actual entity.
unconsumeCharacters(source, consumedCharacters);
consumedCharacters.clear();
const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
const int length = mostRecent->length;
const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
for (int i = 0; i < length; ++i) {
cc = source.currentChar();
ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
consumedCharacters.append(cc);
source.advanceAndASSERT(cc);
ASSERT(!source.isEmpty());
}
cc = source.currentChar();
}
if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
|| !additionalAllowedCharacter
|| !(isAlphaNumeric(cc) || cc == '=')) {
decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
decodedEntity.append(second);
return true;
}
unconsumeCharacters(source, consumedCharacters);
return false;
}
bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
void HTMLEntityParser::reset()
{
ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
ASSERT(!notEnoughCharacters);
ASSERT(decodedEntity.isEmpty());
enum EntityState {
Initial,
Number,
MaybeHexLowerCaseX,
MaybeHexUpperCaseX,
Hex,
Decimal,
Named
};
EntityState entityState = Initial;
UChar32 result = 0;
ConsumedCharacterBuffer consumedCharacters;
m_state = Initial;
m_result = '\0';
m_buffer.clear();
m_buffer.append('&');
}
bool HTMLEntityParser::parse(SegmentedString& source)
{
while (!source.isEmpty()) {
UChar cc = source.currentChar();
switch (entityState) {
switch (m_state) {
case Initial: {
if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
return false;
if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
return false;
if (cc == '#') {
entityState = Number;
m_state = Numeric;
break;
}
if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
entityState = Named;
if (isAlphaNumeric(cc)) {
m_state = Named;
continue;
}
return false;
return true;
}
case Number: {
if (cc == 'x') {
entityState = MaybeHexLowerCaseX;
break;
}
if (cc == 'X') {
entityState = MaybeHexUpperCaseX;
case Numeric: {
if (cc == 'x' || cc == 'X') {
m_state = PossiblyHex;
break;
}
if (cc >= '0' && cc <= '9') {
entityState = Decimal;
m_state = Decimal;
continue;
}
source.push('#');
return false;
return true;
}
case MaybeHexLowerCaseX: {
case PossiblyHex: {
if (isHexDigit(cc)) {
entityState = Hex;
m_state = Hex;
continue;
}
source.push('#');
source.push('x');
return false;
}
case MaybeHexUpperCaseX: {
if (isHexDigit(cc)) {
entityState = Hex;
continue;
}
source.push('#');
source.push('X');
return false;
return true;
}
case Hex: {
if (isHexDigit(cc)) {
if (result != kInvalidUnicode)
result = result * 16 + asHexDigit(cc);
} else if (cc == ';') {
if (m_result != kInvalidUnicode)
m_result = m_result * 16 + asHexDigit(cc);
break;
}
if (cc == ';') {
source.advanceAndASSERT(cc);
appendLegalEntityFor(result, decodedEntity);
return true;
} else {
appendLegalEntityFor(result, decodedEntity);
finalizeNumericEntity();
return true;
}
break;
return true;
}
case Decimal: {
if (cc >= '0' && cc <= '9') {
if (result != kInvalidUnicode)
result = result * 10 + cc - '0';
} else if (cc == ';') {
if (m_result != kInvalidUnicode)
m_result = m_result * 10 + cc - '0';
break;
}
if (cc == ';') {
source.advanceAndASSERT(cc);
appendLegalEntityFor(result, decodedEntity);
return true;
} else {
appendLegalEntityFor(result, decodedEntity);
finalizeNumericEntity();
return true;
}
break;
return true;
}
case Named: {
return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
if (isAlphaNumeric(cc))
break;
if (cc == ';') {
source.advanceAndASSERT(cc);
finalizeNamedEntity();
return true;
}
return true;
}
}
if (result > UCHAR_MAX_VALUE)
result = kInvalidUnicode;
if (m_result > UCHAR_MAX_VALUE)
m_result = kInvalidUnicode;
consumedCharacters.append(cc);
m_buffer.append(cc);
source.advanceAndASSERT(cc);
}
ASSERT(source.isEmpty());
notEnoughCharacters = true;
unconsumeCharacters(source, consumedCharacters);
return false;
}
static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
void HTMLEntityParser::finalizeNumericEntity()
{
if (U_IS_BMP(value)) {
UChar character = static_cast<UChar>(value);
ASSERT(character == value);
result[0] = character;
return 1;
m_buffer.clear();
if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) {
m_buffer.append(replacementCharacter);
} else if (U_IS_BMP(m_result)) {
m_buffer.append(m_result);
} else {
m_buffer.append(U16_LEAD(m_result));
m_buffer.append(U16_TRAIL(m_result));
}
result[0] = U16_LEAD(value);
result[1] = U16_TRAIL(value);
return 2;
}
size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
void HTMLEntityParser::finalizeNamedEntity()
{
HTMLEntitySearch search;
while (*name) {
search.advance(*name++);
if (!search.isEntityPrefix())
return 0;
}
search.advance(';');
if (!search.isEntityPrefix())
return 0;
size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
if (!search.mostRecentMatch()->secondValue)
return numberOfCodePoints;
return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
UChar decodedEntity = decodeEntity(m_buffer);
m_buffer.clear();
m_buffer.append(decodedEntity);
}
} // namespace blink
......@@ -31,40 +31,35 @@
namespace blink {
class DecodedHTMLEntity {
private:
// HTML entities contain at most four UTF-16 code units.
static const unsigned kMaxLength = 4;
class HTMLEntityParser {
public:
DecodedHTMLEntity() : length(0) { }
typedef Vector<UChar, 32> OutputBuffer;
bool isEmpty() const { return !length; }
HTMLEntityParser();
~HTMLEntityParser();
void append(UChar c)
{
RELEASE_ASSERT(length < kMaxLength);
data[length++] = c;
}
void reset();
bool parse(SegmentedString&);
void append(UChar32 c)
{
if (U_IS_BMP(c)) {
append(static_cast<UChar>(c));
return;
}
append(U16_LEAD(c));
append(U16_TRAIL(c));
}
const OutputBuffer& result() const { return m_buffer; }
unsigned length;
UChar data[kMaxLength];
};
private:
enum EntityState {
Initial,
Numeric,
PossiblyHex,
Hex,
Decimal,
Named
};
bool consumeHTMLEntity(SegmentedString&, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0');
void finalizeNumericEntity();
void finalizeNamedEntity();
// Used by the XML parser. Not suitable for use in HTML parsing. Use consumeHTMLEntity instead.
size_t decodeNamedEntityToUCharArray(const char*, UChar result[4]);
EntityState m_state;
UChar32 m_result;
OutputBuffer m_buffer;
};
}
......
......@@ -111,24 +111,6 @@ void HTMLTokenizer::reset()
{
m_state = HTMLTokenizer::DataState;
m_token = 0;
m_additionalAllowedCharacter = '\0';
}
// Tries to consume one character reference from |source| and buffers the
// outcome as character data: the decoded code units on success, or a literal
// '&' when nothing was decoded. Returns false only when consumeHTMLEntity()
// reports that it did not have enough characters; returns true otherwise.
inline bool HTMLTokenizer::processEntity(SegmentedString& source)
{
    bool needsMoreInput = false;
    DecodedHTMLEntity entity;
    const bool decoded = consumeHTMLEntity(source, entity, needsMoreInput);
    if (needsMoreInput)
        return false;

    if (decoded) {
        for (unsigned index = 0; index < entity.length; ++index)
            bufferCharacter(entity.data[index]);
        return true;
    }

    // Nothing was decoded, so the ampersand is passed through as plain text.
    ASSERT(entity.isEmpty());
    bufferCharacter('&');
    return true;
}
bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
......@@ -146,7 +128,7 @@ bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
#define FLUSH_AND_ADVANCE_TO(stateName) \
do { \
m_state = HTMLTokenizer::stateName; \
m_state = HTMLTokenizer::stateName; \
if (flushBufferedEndTag(source)) \
return true; \
if (source.isEmpty() \
......@@ -190,9 +172,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
// Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
switch (m_state) {
HTML_BEGIN_STATE(DataState) {
if (cc == '&')
if (cc == '&') {
m_returnState = DataState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInDataState);
else if (cc == '<') {
} else if (cc == '<') {
if (m_token->type() == HTMLToken::Character) {
// We have a bunch of character tokens queued up that we
// are emitting lazily here.
......@@ -209,12 +193,34 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
END_STATE()
// Resolves a character reference that started with an '&' seen in DataState.
HTML_BEGIN_STATE(CharacterReferenceInDataState) {
    // When parse() returns false, hand control back and report whether a
    // character token is buffered (presumably the entity parser needs more
    // input -- confirm against HTMLEntityParser::parse).
    if (!m_entityParser.parse(source))
        return haveBufferedCharacterToken();
    // Emit every code unit the entity parser produced as character data.
    for (const UChar& entityCharacter : m_entityParser.result())
        bufferCharacter(entityCharacter);
    cc = m_inputStreamPreprocessor.nextInputCharacter();
    // DataState stores itself in m_returnState before advancing to this state,
    // so any other value here means the state was entered unexpectedly.
    ASSERT(m_returnState == DataState);
    HTML_SWITCH_TO(DataState);
}
END_STATE()
// Resolves a character reference encountered inside an attribute value and
// then resumes the attribute-value state that was active when the '&' was seen.
HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
// When parse() returns false, hand control back and report whether a character
// token is buffered (presumably the entity parser needs more input -- TODO confirm).
if (!m_entityParser.parse(source))
return haveBufferedCharacterToken();
// Append each code unit of the parser's result to the current attribute value.
for (const UChar& entityCharacter : m_entityParser.result())
m_token->appendToAttributeValue(entityCharacter);
cc = m_inputStreamPreprocessor.nextInputCharacter();
// m_returnState was set by the quoted/unquoted attribute-value state that
// advanced here; any other value is treated as a programming error.
if (m_returnState == AttributeValueDoubleQuotedState)
HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
else if (m_returnState == AttributeValueSingleQuotedState)
HTML_SWITCH_TO(AttributeValueSingleQuotedState);
else if (m_returnState == AttributeValueUnquotedState)
HTML_SWITCH_TO(AttributeValueUnquotedState);
else
ASSERT_NOT_REACHED();
}
END_STATE()
HTML_BEGIN_STATE(RAWTEXTState) {
if (cc == '<')
HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
......@@ -477,7 +483,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
} else if (cc == '&') {
m_additionalAllowedCharacter = '"';
m_returnState = AttributeValueDoubleQuotedState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == kEndOfFileMarker) {
parseError();
......@@ -495,7 +502,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
} else if (cc == '&') {
m_additionalAllowedCharacter = '\'';
m_returnState = AttributeValueSingleQuotedState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == kEndOfFileMarker) {
parseError();
......@@ -513,7 +521,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->endAttributeValue(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(BeforeAttributeNameState);
} else if (cc == '&') {
m_additionalAllowedCharacter = '>';
m_returnState = AttributeValueUnquotedState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
} else if (cc == '>') {
m_token->endAttributeValue(source.numberOfCharactersConsumed());
......@@ -531,34 +540,6 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
// Resolves a character reference inside an attribute value by calling
// consumeHTMLEntity() with m_additionalAllowedCharacter, then returns to the
// attribute-value state implied by that character.
HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
bool notEnoughCharacters = false;
DecodedHTMLEntity decodedEntity;
bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
// Not enough characters to decide: hand control back, reporting whether a
// character token is buffered.
if (notEnoughCharacters)
return haveBufferedCharacterToken();
if (!success) {
// Nothing was decoded, so the '&' is appended to the attribute value as-is.
ASSERT(decodedEntity.isEmpty());
m_token->appendToAttributeValue('&');
} else {
for (unsigned i = 0; i < decodedEntity.length; ++i)
m_token->appendToAttributeValue(decodedEntity.data[i]);
}
// We're supposed to switch back to the attribute value state that
// we were in when we were switched into this state. Rather than
// keeping track of this explicitly, we observe that the previous
// state can be determined by m_additionalAllowedCharacter.
if (m_additionalAllowedCharacter == '"')
HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
else if (m_additionalAllowedCharacter == '\'')
HTML_SWITCH_TO(AttributeValueSingleQuotedState);
else if (m_additionalAllowedCharacter == '>')
HTML_SWITCH_TO(AttributeValueUnquotedState);
else
ASSERT_NOT_REACHED();
}
END_STATE()
HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
if (isTokenizerWhitespace(cc))
HTML_ADVANCE_TO(BeforeAttributeNameState);
......
......@@ -27,6 +27,7 @@
#ifndef HTMLTokenizer_h
#define HTMLTokenizer_h
#include "core/html/parser/HTMLEntityParser.h"
#include "core/html/parser/HTMLToken.h"
#include "core/html/parser/InputStreamPreprocessor.h"
#include "platform/text/SegmentedString.h"
......@@ -45,6 +46,7 @@ public:
enum State {
DataState,
CharacterReferenceInDataState,
CharacterReferenceInAttributeValueState,
RAWTEXTState,
TagOpenState,
EndTagOpenState,
......@@ -59,7 +61,6 @@ public:
AttributeValueDoubleQuotedState,
AttributeValueSingleQuotedState,
AttributeValueUnquotedState,
CharacterReferenceInAttributeValueState,
AfterAttributeValueQuotedState,
SelfClosingStartTagState,
BogusCommentState,
......@@ -87,8 +88,6 @@ public:
private:
HTMLTokenizer();
inline bool processEntity(SegmentedString&);
inline void parseError();
inline void bufferCharacter(UChar character)
......@@ -156,11 +155,11 @@ private:
// this member might be pointing to unallocated memory.
HTMLToken* m_token;
// http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
UChar m_additionalAllowedCharacter;
State m_returnState;
// http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
HTMLEntityParser m_entityParser;
Vector<UChar, 32> m_appropriateEndTagName;
......
<html>
<link rel="import" href="../resources/dump-as-markup.html"></link>
<body>
<div>aaa'bbb</div>
<div>&lt;</div>
<div>aaa�bbb</div>
<div foo="bar ">A</div>
<div>&amp;</div>
<div>&amp;#</div>
<div>&amp;#x</div>
<div>&amp;#x41</div>
<div>&amp;;</div>
<div>&amp;#;</div>
<div>&amp;#x;</div>
<div>A</div>
<div>�</div>
</body>
</html>
<html>
<link rel="import" href="../resources/dump-as-markup.html" />
<body>
<div>aaa&apos;bbb</div>
<div>&lt;</div>
<div>aaa&xxx;bbb</div>
<div foo="bar&#x20;">&#65;</div>
<div>&</div>
<div>&#</div>
<div>&#x</div>
<div>&#x41</div>
<div>&;</div>
<div>&#;</div>
<div>&#x;</div>
<div>&#x41;</div>
<div>&#x0;</div>
</body>
</html>
<script>
// After the window's load event fires, pass the markup produced by
// internals.contentAsMarkup() to internals.notifyTestComplete().
window.addEventListener('load', function() {
    var markup = internals.contentAsMarkup();
    internals.notifyTestComplete(markup);
});
</script>
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册