fix: non-printable characters in XML (#6952)

* fix: non-printable characters in XML

* fix: PR comments

* chore: format

* chore: move to module-level parser and serializer

* chore: reorganize textToDom

* chore: add dummy implementations of domParser and xmlSerializer

* chore: properly check classes before constructing

* chore: fix tests

* chore: PR comments

* chore: remove null char from tests

* chore: docs!
This commit is contained in:
Beka Westberg
2023-04-17 16:05:09 -07:00
committed by GitHub
parent 20f0c937cd
commit edc5843c4c
3 changed files with 125 additions and 22 deletions

View File

@@ -10,6 +10,22 @@ goog.declareModuleId('Blockly.utils.xml');
import * as deprecation from './deprecation.js';
/**
 * Module-level DOMParser. It starts out as a placeholder whose
 * parseFromString always throws, so that using it before a real parser has
 * been assigned produces a descriptive error.
 */
let domParser: DOMParser = {
  parseFromString() {
    const reason =
        'DOMParser was not found in the global scope and was not properly ' +
        'injected using injectDependencies';
    throw new Error(reason);
  },
};
/**
 * Module-level XMLSerializer. It starts out as a placeholder whose
 * serializeToString always throws, so that using it before a real serializer
 * has been assigned produces a descriptive error.
 */
let xmlSerializer: XMLSerializer = {
  serializeToString: function() {
    throw new Error(
        'XMLSerializer was not found in the global scope and was not ' +
        'properly injected using injectDependencies');
  },
};
/**
* Injected dependencies. By default these are just (and have the
* same types as) the corresponding DOM Window properties, but the
@@ -18,6 +34,8 @@ import * as deprecation from './deprecation.js';
* package instead.
*/
let {document, DOMParser, XMLSerializer} = globalThis;
// When the classes exist in the global scope, replace the throwing
// placeholder objects with real instances right away. If they are missing,
// the placeholders stay in place.
if (DOMParser) domParser = new DOMParser();
if (XMLSerializer) xmlSerializer = new XMLSerializer();
/**
* Inject implementations of document, DOMParser and/or XMLSerializer
@@ -50,6 +68,9 @@ export function injectDependencies(dependencies: {
DOMParser = DOMParser,
XMLSerializer = XMLSerializer,
} = dependencies);
domParser = new DOMParser();
xmlSerializer = new XMLSerializer();
}
/**
@@ -57,6 +78,9 @@ export function injectDependencies(dependencies: {
*/
export const NAME_SPACE = 'https://developers.google.com/blockly/xml';
// Matches the control characters U+0000 through U+001F, except line feed
// (U+000A) and carriage return (U+000D); note that tab (U+0009) is matched.
// The `g` flag makes String.prototype.replace act on every occurrence.
// eslint-disable-next-line no-control-regex
const INVALID_CONTROL_CHARS = /[\x00-\x09\x0B\x0C\x0E-\x1F]/g;
/**
* Get the document object to use for XML serialization.
*
@@ -102,18 +126,35 @@ export function createTextNode(text: string): Text {
/**
 * Converts an XML string into a DOM structure.
 *
 * Control characters should be escaped. (But we will make a best-effort
 * attempt to parse unescaped characters.)
 *
 * Note that even when escaped, U+0000 will be parsed as U+FFFD (the
 * "replacement character") because U+0000 is never a valid XML character
 * (even in XML 1.1).
 * https://www.w3.org/TR/xml11/#charsets
 *
 * @param text An XML string.
 * @returns A DOM object representing the singular child of the document
 * element.
 * @throws if the text doesn't parse.
 */
export function textToDom(text: string): Element {
// NOTE(review): this listing appears to show the removed and added lines of
// the diff hunk together (no +/- markers) - confirm against the real file.
const doc = textToDomDocument(text);
if (!doc || !doc.documentElement ||
doc.getElementsByTagName('parsererror').length) {
throw Error('textToDom was unable to parse: ' + text);
// First attempt: parse as XML. The result is accepted only when it has a
// document element and contains no <parsererror> elements.
let doc = domParser.parseFromString(text, 'text/xml');
if (doc && doc.documentElement &&
!doc.getElementsByTagName('parsererror').length) {
return doc.documentElement;
}
return doc.documentElement;
// Attempt to parse as HTML to deserialize control characters that were
// serialized before the serializer did proper escaping.
doc = domParser.parseFromString(text, 'text/html');
// The HTML result is accepted only when the first child of <body> is a node
// named "xml" (compared case-insensitively).
if (doc && doc.body.firstChild &&
doc.body.firstChild.nodeName.toLowerCase() === 'xml') {
return doc.body.firstChild as Element;
}
// Neither the XML nor the HTML attempt produced a usable root element.
throw new Error(`DOMParser was unable to parse: ${text}`);
}
/**
@@ -124,18 +165,30 @@ export function textToDom(text: string): Element {
* @throws if XML doesn't parse.
*/
export function textToDomDocument(text: string): Document {
// NOTE(review): this listing appears to show the removed and added lines of
// the diff hunk together (no +/- markers) - confirm against the real file.
const oParser = new DOMParser();
return oParser.parseFromString(text, 'text/xml');
// Emits a deprecation warning naming this function. 'version 10' and
// 'version 11' are presumably the deprecation and removal versions - confirm
// against deprecation.warn.
deprecation.warn(
'Blockly.utils.xml.textToDomDocument', 'version 10', 'version 11');
// Parse as XML using the module-level domParser.
return domParser.parseFromString(text, 'text/xml');
}
/**
 * Converts a DOM structure into plain text.
 * Currently the text format is fairly ugly: all one line with no whitespace.
 *
 * Control characters are escaped using their decimal encodings. This includes
 * U+0000 even though it is technically never a valid XML character (even in
 * XML 1.1).
 * https://www.w3.org/TR/xml11/#charsets
 *
 * When decoded, U+0000 will be parsed as U+FFFD (the "replacement character").
 *
 * @param dom A tree of XML nodes.
 * @returns Text representation.
 */
export function domToText(dom: Node): string {
// NOTE(review): this listing appears to show the removed and added lines of
// the diff hunk together (no +/- markers) - confirm against the real file.
const oSerializer = new XMLSerializer();
return oSerializer.serializeToString(dom);
// Serialize with the module-level xmlSerializer, then pass the result through
// sanitizeText, which replaces characters matched by INVALID_CONTROL_CHARS
// with decimal character references.
return sanitizeText(xmlSerializer.serializeToString(dom));
}
/**
 * Replaces every character matched by INVALID_CONTROL_CHARS with its decimal
 * numeric character reference (for example, U+0001 becomes `&#1;`).
 *
 * @param text The text to escape.
 * @returns The text with each matched control character escaped.
 */
function sanitizeText(text: string) {
  const toDecimalReference = (controlChar: string) => {
    const codeUnit = controlChar.charCodeAt(0);
    return '&#' + codeUnit + ';';
  };
  return text.replace(INVALID_CONTROL_CHARS, toDecimalReference);
}