forked from lix-project/lix-website
452 lines
12 KiB
JavaScript
452 lines
12 KiB
JavaScript
|
'use strict'
|
|||
|
|
|||
|
var legacy = require('character-entities-legacy')
|
|||
|
var invalid = require('character-reference-invalid')
|
|||
|
var decimal = require('is-decimal')
|
|||
|
var hexadecimal = require('is-hexadecimal')
|
|||
|
var alphanumerical = require('is-alphanumerical')
|
|||
|
var decodeEntity = require('./decode-entity')
|
|||
|
|
|||
|
module.exports = parseEntities
|
|||
|
|
|||
|
// Cached built-ins shared by the parser.
// `own` is used with `.call` to test for own properties on lookup maps.
var own = {}.hasOwnProperty
var fromCharCode = String.fromCharCode
// `Function.prototype` is itself callable, ignores its arguments and returns
// `undefined`, so it doubles as a no-op when no warning handler is given.
var noop = Function.prototype
|
|||
|
|
|||
|
// Default settings.
// `parseEntities` copies every key listed here into the settings it passes to
// `parse`, falling back to these values when an option is `null` or
// `undefined`.
var defaults = {
  warning: null,
  reference: null,
  text: null,
  warningContext: null,
  referenceContext: null,
  textContext: null,
  position: {},
  additional: null,
  attribute: false,
  nonTerminated: true
}
|
|||
|
|
|||
|
// Characters.
// Character codes (UTF-16 code units) that the parser compares against.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var numberSign = 35 // '#'
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // '\uFFFD' (replacement character)
|
|||
|
|
|||
|
// Reference types.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Map of bases.
// Keyed by numeric reference type; the value is the radix handed to
// `parseInt`.
var bases = {}

bases[hexa] = 16
bases[deci] = 10
|
|||
|
|
|||
|
// Map of types to tests.
// Each type of character reference accepts different characters.
// This test is used to detect whether a reference has ended (as the semicolon
// is not strictly needed).
var tests = {}

tests[name] = alphanumerical
tests[deci] = decimal
tests[hexa] = hexadecimal
|
|||
|
|
|||
|
// Warning types.
// Numeric codes passed to the warning handler alongside the message.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Warning messages.
// Keyed by warning type.
var messages = {}

messages[namedNotTerminated] =
  'Named character references must be terminated by a semicolon'
messages[numericNotTerminated] =
  'Numeric character references must be terminated by a semicolon'
messages[namedEmpty] = 'Named character references cannot be empty'
messages[numericEmpty] = 'Numeric character references cannot be empty'
messages[namedUnknown] = 'Named character references must be known'
messages[numericDisallowed] =
  'Numeric character references cannot be disallowed'
messages[numericProhibited] =
  'Numeric character references cannot be outside the permissible Unicode range'
|
|||
|
|
|||
|
/**
 * Decode the character references in `value`.
 *
 * Wrap to ensure clean parameters are given to `parse`: only the keys listed
 * in `defaults` are copied, and an option that is `null` or `undefined` falls
 * back to its default.
 *
 * @param {string} value - Text to decode.
 * @param {Object} [options] - Configuration; see `defaults` for the keys.
 * @returns {string} Whatever `parse` returns for the normalized settings.
 */
function parseEntities(value, options) {
  var settings = {}
  var option
  var key

  if (!options) {
    options = {}
  }

  for (key in defaults) {
    option = options[key]
    settings[key] =
      option === null || option === undefined ? defaults[key] : option
  }

  // `position` may be a point, or an object carrying `start` and/or `indent`.
  // In the latter case, split it into the start point and the indent list.
  if (settings.position.indent || settings.position.start) {
    settings.indent = settings.position.indent || []
    settings.position = settings.position.start
  }

  return parse(value, settings)
}
|
|||
|
|
|||
|
/**
 * Parse entities.
 *
 * Walks `value` one UTF-16 code unit at a time.  Plain text is collected in a
 * queue; every `&` is examined as a possible named, decimal, or hexadecimal
 * character reference.  Decoded references and flushed text are pushed onto a
 * result list that is joined at the end.  The optional `text`, `reference`,
 * and `warning` handlers in `settings` are called (with their respective
 * `*Context` as `this`) as chunks are produced.
 *
 * @param {string} value - Text to decode.
 * @param {Object} settings - Normalized settings; `settings.position` must be
 *   an object (it is read for `line`, `column`, and `offset`).
 * @returns {string} `value` with its character references decoded.
 */
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    // `NaN` once `index` runs past the end of `value` (EOF).
    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      // `following !== following` is true only for `NaN`, i.e. EOF.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // Reset for every candidate: the branches below that find nothing do
      // not assign `reference`, and a value left over from an earlier match
      // must not be emitted again.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Empty.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there's something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // Non-terminated numeric entities trigger a warning.
          warning(numericNotTerminated, diff)
        }

        // Parse the digits as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above the BMP are encoded as a surrogate pair.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `character === character` is false only for `NaN`, i.e. EOF.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` is added to both the column and the offset of the current point.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
|
|||
|
|
|||
|
/**
 * Check if `code` is outside the permissible Unicode range: either inside the
 * surrogate block (U+D800 through U+DFFF) or above U+10FFFF.
 *
 * @param {number} code - Code point to check.
 * @returns {boolean} Whether `code` is prohibited.
 */
function prohibited(code) {
  return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
}
|
|||
|
|
|||
|
/**
 * Check if `code` is disallowed: it falls in one of the listed control
 * ranges, in U+FDD0 through U+FDEF, or its low 16 bits are `0xFFFE` or
 * `0xFFFF`.
 *
 * @param {number} code - Code point to check.
 * @returns {boolean} Whether `code` is disallowed.
 */
function disallowed(code) {
  return (
    (code >= 0x0001 && code <= 0x0008) ||
    code === 0x000b ||
    (code >= 0x000d && code <= 0x001f) ||
    (code >= 0x007f && code <= 0x009f) ||
    (code >= 0xfdd0 && code <= 0xfdef) ||
    (code & 0xffff) === 0xffff ||
    (code & 0xffff) === 0xfffe
  )
}
|