// Source listing metadata: 452 lines, 12 KiB, JavaScript.
'use strict'

// Map of named references that are looked up without requiring a semicolon.
var legacy = require('character-entities-legacy')
// Map of numeric references to the character that replaces them.
var invalid = require('character-reference-invalid')
// Character-code predicates, used to detect where a reference ends.
var decimal = require('is-decimal')
var hexadecimal = require('is-hexadecimal')
var alphanumerical = require('is-alphanumerical')
// Decodes a terminated named reference; yields a falsy value when unknown.
var decodeEntity = require('./decode-entity')

module.exports = parseEntities

// Shorthands for built-ins used by the parser.
var own = {}.hasOwnProperty
var fromCharCode = String.fromCharCode
// Callable that does nothing; used in place of a warning handler.
var noop = Function.prototype

// Default settings.
var defaults = {
  warning: null,
  reference: null,
  text: null,
  warningContext: null,
  referenceContext: null,
  textContext: null,
  position: {},
  additional: null,
  attribute: false,
  nonTerminated: true
}

// Characters.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var numberSign = 35 // '#'
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // '\uFFFD' (replacement character)

// Reference types.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Map of bases.
// Numeric reference types mapped to the radix passed to `parseInt`.
var bases = {}

bases[hexa] = 16
bases[deci] = 10

// Map of types to tests.
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
// Each test receives a character code and returns whether it may be part of
// the reference body.
var tests = {}

tests[name] = alphanumerical
tests[deci] = decimal
tests[hexa] = hexadecimal

// Warning types.
// These codes are passed to the warning handler as its third argument.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Warning messages.
// Keyed by warning type; passed to the warning handler as its first argument.
var messages = {}

messages[namedNotTerminated] =
  'Named character references must be terminated by a semicolon'
messages[numericNotTerminated] =
  'Numeric character references must be terminated by a semicolon'
messages[namedEmpty] = 'Named character references cannot be empty'
messages[numericEmpty] = 'Numeric character references cannot be empty'
messages[namedUnknown] = 'Named character references must be known'
messages[numericDisallowed] =
  'Numeric character references cannot be disallowed'
messages[numericProhibited] =
  'Numeric character references cannot be outside the permissible Unicode range'

/**
 * Decode character references in `value`.
 *
 * Wraps `parse` to ensure it is given clean parameters: every key of
 * `defaults` is copied from `options`, falling back to the default when the
 * option is `null` or `undefined`.
 *
 * @param {string} value - Text that may contain character references.
 * @param {Object} [options] - Overrides for the keys of `defaults`.
 * @returns {string} The result of `parse` for the normalised settings.
 */
function parseEntities(value, options) {
  var settings = {}
  var option
  var key

  if (!options) {
    options = {}
  }

  for (key in defaults) {
    option = options[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  // `position` is either a point (`line`, `column`, `offset`) or an object
  // carrying a `start` point and/or an `indent` list; unpack the latter form.
  if (settings.position.indent || settings.position.start) {
    settings.indent = settings.position.indent || []
    // Fall back to an empty point when only `indent` was given, so `parse`
    // can still read `line` / `column` / `offset` without throwing.
    settings.position = settings.position.start || {}
  }

  return parse(value, settings)
}

/**
 * Parse entities.
 *
 * Walks `value` one character code at a time, decoding named, decimal, and
 * hexadecimal character references.  Text between references is queued and
 * flushed as one chunk.  The optional handlers on `settings` are invoked with
 * their matching `*Context` as `this`:
 *
 * - `text(chunk, {start, end})` for each flushed run of plain text;
 * - `reference(decoded, {start, end}, source)` for each decoded reference;
 * - `warning(message, point, code)` for each recoverable problem.
 *
 * @param {string} value - Text to decode.
 * @param {Object} settings - Normalised settings (see `defaults`); `position`
 *   is a point with optional `line`, `column`, and `offset`, and `indent` is
 *   an optional list giving the start column of each subsequent line.
 * @returns {string} `value` with its character references decoded.
 */
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  // `additional` is compared against character codes below.
  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      // (`following !== following` is true only for `NaN`: end of input.)
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      // `start` is just past the ampersand, `begin` is where the reference
      // body starts, and `end` is the scanning cursor.
      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // Reset per ampersand: the branches below that decode nothing leave
      // `reference` untouched, and must not see the previous reference.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              // In attributes, a non-terminated reference followed by `=` or
              // an alphanumerical is left as-is (the former with a warning).
              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // All non-terminated numeric entities are not rendered, and trigger a
          // warning.
          warning(numericNotTerminated, diff)
        }

        // When terminated and number, parse as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above the BMP are written as a surrogate pair: the
          // high surrogate goes to `output`, the low one stays in `reference`.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        // `index` points at the last consumed character; the end point is one
        // past it.
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `charCodeAt` yields `NaN` past the end of `value`; `NaN` is the only
      // value not equal to itself, so the `else` branch runs once, at EOF.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // The reported point is the current position moved `offset` characters
  // forward.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}

/**
 * Check if `code` is outside the permissible unicode range: a surrogate
 * (0xD800–0xDFFF) or greater than 0x10FFFF.
 *
 * @param {number} code - Parsed value of a numeric character reference.
 * @returns {boolean} Whether `code` is prohibited.
 */
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}

/**
 * Check if `code` is disallowed: one of the listed control-character ranges,
 * 0xFDD0–0xFDEF, or any value whose low 16 bits are 0xFFFE or 0xFFFF.
 *
 * @param {number} code - Parsed value of a numeric character reference.
 * @returns {boolean} Whether `code` is disallowed.
 */
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}