diff --git a/core/utils/xml.ts b/core/utils/xml.ts index a02f1a8d6..a9019c0be 100644 --- a/core/utils/xml.ts +++ b/core/utils/xml.ts @@ -10,6 +10,22 @@ goog.declareModuleId('Blockly.utils.xml'); import * as deprecation from './deprecation.js'; +let domParser: DOMParser = { + parseFromString: function() { + throw new Error( + 'DOMParser was not found in the global scope and was not properly ' + + 'injected using injectDependencies'); + }, +}; + +let xmlSerializer: XMLSerializer = { + serializeToString: function() { + throw new Error( + 'XMLSerializer was not found in the global scope and was not properly ' + + 'injected using injectDependencies'); + }, +}; + /** * Injected dependencies. By default these are just (and have the * same types as) the corresponding DOM Window properties, but the @@ -18,6 +34,8 @@ import * as deprecation from './deprecation.js'; * package instead. */ let {document, DOMParser, XMLSerializer} = globalThis; +if (DOMParser) domParser = new DOMParser(); +if (XMLSerializer) xmlSerializer = new XMLSerializer(); /** * Inject implementations of document, DOMParser and/or XMLSerializer @@ -50,6 +68,9 @@ export function injectDependencies(dependencies: { DOMParser = DOMParser, XMLSerializer = XMLSerializer, } = dependencies); + + domParser = new DOMParser(); + xmlSerializer = new XMLSerializer(); } /** @@ -57,6 +78,9 @@ export function injectDependencies(dependencies: { */ export const NAME_SPACE = 'https://developers.google.com/blockly/xml'; +// eslint-disable-next-line no-control-regex +const INVALID_CONTROL_CHARS = /[\x00-\x09\x0B\x0C\x0E-\x1F]/g; + /** * Get the document object to use for XML serialization. * @@ -102,18 +126,35 @@ export function createTextNode(text: string): Text { /** * Converts an XML string into a DOM structure. * + * Control characters should be escaped. (But we will try to best-effort parse + * unescaped characters.) 
+ * + * Note that even when escaped, U+0000 will be parsed as U+FFFD (the + * "replacement character") because U+0000 is never a valid XML character + * (even in XML 1.1). + * https://www.w3.org/TR/xml11/#charsets + * * @param text An XML string. * @returns A DOM object representing the singular child of the document * element. * @throws if the text doesn't parse. */ export function textToDom(text: string): Element { - const doc = textToDomDocument(text); - if (!doc || !doc.documentElement || - doc.getElementsByTagName('parsererror').length) { - throw Error('textToDom was unable to parse: ' + text); + let doc = domParser.parseFromString(text, 'text/xml'); + if (doc && doc.documentElement && + !doc.getElementsByTagName('parsererror').length) { + return doc.documentElement; } - return doc.documentElement; + + // Attempt to parse as HTML to deserialize control characters that were + // serialized before the serializer did proper escaping. + doc = domParser.parseFromString(text, 'text/html'); + if (doc && doc.body.firstChild && + doc.body.firstChild.nodeName.toLowerCase() === 'xml') { + return doc.body.firstChild as Element; + } + + throw new Error(`DOMParser was unable to parse: ${text}`); } /** @@ -124,18 +165,30 @@ export function textToDom(text: string): Element { * @throws if XML doesn't parse. */ export function textToDomDocument(text: string): Document { - const oParser = new DOMParser(); - return oParser.parseFromString(text, 'text/xml'); + deprecation.warn( + 'Blockly.utils.xml.textToDomDocument', 'version 10', 'version 11'); + return domParser.parseFromString(text, 'text/xml'); } /** * Converts a DOM structure into plain text. * Currently the text format is fairly ugly: all one line with no whitespace. * + * Control characters are escaped using their decimal encodings. This includes + * U+0000 even though it is technically never a valid XML character (even in + * XML 1.1). 
+ * https://www.w3.org/TR/xml11/#charsets + * + * When decoded U+0000 will be parsed as U+FFFD (the "replacement character"). + * * @param dom A tree of XML nodes. * @returns Text representation. */ export function domToText(dom: Node): string { - const oSerializer = new XMLSerializer(); - return oSerializer.serializeToString(dom); + return sanitizeText(xmlSerializer.serializeToString(dom)); +} + +function sanitizeText(text: string) { + return text.replace( + INVALID_CONTROL_CHARS, (match) => `&#${match.charCodeAt(0)};`); } diff --git a/tests/mocha/serializer_test.js b/tests/mocha/serializer_test.js index d45cd40c3..02a5fc410 100644 --- a/tests/mocha/serializer_test.js +++ b/tests/mocha/serializer_test.js @@ -370,7 +370,7 @@ Serializer.Fields.LabelSerializable.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Fields.LabelSerializable.testCases = [ @@ -416,7 +416,7 @@ Serializer.Fields.MultilineInput.Tabs = new SerializerTestCase( '' + '' + '' + - 'line1
 line2
 line3' + + 'line1
&#x9line2
&#x9line3' + '' + '' + ''); @@ -487,7 +487,7 @@ Serializer.Fields.MultilineInput.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Fields.MultilineInput.testCases = [ @@ -588,7 +588,7 @@ Serializer.Fields.TextInput.Simple = new SerializerTestCase('Simple', Serializer.Fields.TextInput.Tabs = new SerializerTestCase('Tabs', '' + '' + - 'line1 line2 line3' + + 'line1&#x9line2&#x9line3' + '' + ''); /* eslint-enable no-tabs */ @@ -658,7 +658,7 @@ Serializer.Fields.TextInput.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Fields.TextInput.testCases = [ @@ -708,10 +708,10 @@ Serializer.Fields.Variable.Types = new SerializerTestCase('Types', Serializer.Fields.Variable.Tabs = new SerializerTestCase('Tabs', '' + '' + - 'line1 line2 line3' + + 'line1&#x9line2&#x9line3' + '' + '' + - 'line1 line2 line3' + + 'line1&#x9line2&#x9line3' + '' + ''); /* eslint-enable no-tabs */ @@ -808,10 +808,10 @@ Serializer.Fields.Variable.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Fields.Variable.testCases = [ @@ -1047,7 +1047,7 @@ Serializer.Icons.Comment.Text.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Icons.Comment.Text.testCases = [ @@ -1804,7 +1804,7 @@ Serializer.Mutations.Procedure.Names.ControlChars = new SerializerTestCase( 'ControlChars', '' + '' + - '' + + '&#a1;' + '' + ''); Serializer.Mutations.Procedure.Names.testCases = [ diff --git a/tests/mocha/xml_test.js b/tests/mocha/xml_test.js index df794bd8a..222eb0393 100644 --- a/tests/mocha/xml_test.js +++ b/tests/mocha/xml_test.js @@ -33,6 +33,9 @@ suite('XML', function() { chai.assert.equal(fieldDom.getAttribute('id'), id); chai.assert.equal(fieldDom.textContent, text); }; + const assertXmlDoc = function(doc) { + chai.assert.equal(doc.nodeName.toLowerCase(), 'xml', 'XML tag'); 
+ }; setup(function() { sharedTestSetup.call(this); Blockly.defineBlocksWithJsonArray([ @@ -73,13 +76,40 @@ suite('XML', function() { teardown(function() { sharedTestTeardown.call(this); }); + suite('textToDom', function() { test('Basic', function() { const dom = Blockly.utils.xml.textToDom(this.complexXmlText); - chai.assert.equal(dom.nodeName, 'xml', 'XML tag'); - chai.assert.equal(dom.getElementsByTagName('block').length, 6, 'Block tags'); + assertXmlDoc(dom); + chai.assert.equal( + dom.getElementsByTagName('block').length, 6, 'Block tags'); + }); + + test( + 'text with hex-encoded NCR Control characters are properly ' + + 'deserialized', + function() { + const dom = Blockly.utils.xml.textToDom(' '); + assertXmlDoc(dom); + chai.assert.equal(dom.firstChild.textContent, '\u0001\t\u001f'); + }); + + test( + 'text with dec-encoded NCR Control characters are properly ' + + 'deserialized', + function() { + const dom = Blockly.utils.xml.textToDom(' '); + assertXmlDoc(dom); + chai.assert.equal(dom.firstChild.textContent, '\u0001\u0009\u001f'); + }); + + test('text with an escaped ampersand is properly deserialized', function() { + const dom = Blockly.utils.xml.textToDom('&'); + assertXmlDoc(dom); + chai.assert.equal(dom.firstChild.textContent, '&'); }); }); + suite('blockToDom', function() { setup(function() { this.workspace = new Blockly.Workspace(); @@ -433,6 +463,7 @@ suite('XML', function() { chai.assert.equal(resultDom.children.length, 0); }); }); + suite('domToText', function() { test('Round tripping', function() { const dom = Blockly.utils.xml.textToDom(this.complexXmlText); @@ -440,7 +471,26 @@ suite('XML', function() { chai.assert.equal(text.replace(/\s+/g, ''), this.complexXmlText.replace(/\s+/g, ''), 'Round trip'); }); + + test('control characters are escaped', function() { + const dom = Blockly.utils.xml.createElement('xml'); + dom.appendChild(Blockly.utils.xml.createTextNode('')); // u0001 + chai.assert.equal( + Blockly.utils.xml.domToText(dom), + '' + ); + 
}); + + test('ampersands are escaped', function() { + const dom = Blockly.utils.xml.createElement('xml'); + dom.appendChild(Blockly.utils.xml.createTextNode('&')); + chai.assert.equal( + Blockly.Xml.domToText(dom), + '&' + ); + }); }); + suite('domToPrettyText', function() { test('Round tripping', function() { const dom = Blockly.utils.xml.textToDom(this.complexXmlText);