diff --git a/core/utils/xml.ts b/core/utils/xml.ts
index a02f1a8d6..a9019c0be 100644
--- a/core/utils/xml.ts
+++ b/core/utils/xml.ts
@@ -10,6 +10,22 @@ goog.declareModuleId('Blockly.utils.xml');
import * as deprecation from './deprecation.js';
+let domParser: DOMParser = {
+ parseFromString: function() {
+ throw new Error(
+ 'DOMParser was not found in the global scope and was not properly ' +
+ 'injected using injectDependencies');
+ },
+};
+
+let xmlSerializer: XMLSerializer = {
+ serializeToString: function() {
+ throw new Error(
+ 'XMLSerializer was not found in the global scope and was not properly ' +
+ 'injected using injectDependencies');
+ },
+};
+
/**
* Injected dependencies. By default these are just (and have the
* same types as) the corresponding DOM Window properties, but the
@@ -18,6 +34,8 @@ import * as deprecation from './deprecation.js';
* package instead.
*/
let {document, DOMParser, XMLSerializer} = globalThis;
+if (DOMParser) domParser = new DOMParser();
+if (XMLSerializer) xmlSerializer = new XMLSerializer();
/**
* Inject implementations of document, DOMParser and/or XMLSerializer
@@ -50,6 +68,9 @@ export function injectDependencies(dependencies: {
DOMParser = DOMParser,
XMLSerializer = XMLSerializer,
} = dependencies);
+
+ domParser = new DOMParser();
+ xmlSerializer = new XMLSerializer();
}
/**
@@ -57,6 +78,9 @@ export function injectDependencies(dependencies: {
*/
export const NAME_SPACE = 'https://developers.google.com/blockly/xml';
+// eslint-disable-next-line no-control-regex
+const INVALID_CONTROL_CHARS = /[\x00-\x09\x0B\x0C\x0E-\x1F]/g;
+
/**
* Get the document object to use for XML serialization.
*
@@ -102,18 +126,35 @@ export function createTextNode(text: string): Text {
/**
* Converts an XML string into a DOM structure.
*
+ * Control characters should be escaped. (But we will try to best-effort parse
+ * unescaped characters.)
+ *
+ * Note that even when escaped, U+0000 will be parsed as U+FFFD (the
+ * "replacement character") because U+0000 is never a valid XML character
+ * (even in XML 1.1).
+ * https://www.w3.org/TR/xml11/#charsets
+ *
* @param text An XML string.
* @returns A DOM object representing the singular child of the document
* element.
* @throws if the text doesn't parse.
*/
export function textToDom(text: string): Element {
- const doc = textToDomDocument(text);
- if (!doc || !doc.documentElement ||
- doc.getElementsByTagName('parsererror').length) {
- throw Error('textToDom was unable to parse: ' + text);
+ let doc = domParser.parseFromString(text, 'text/xml');
+ if (doc && doc.documentElement &&
+ !doc.getElementsByTagName('parsererror').length) {
+ return doc.documentElement;
}
- return doc.documentElement;
+
+ // Attempt to parse as HTML to deserialize control characters that were
+ // serialized before the serializer did proper escaping.
+ doc = domParser.parseFromString(text, 'text/html');
+ if (doc && doc.body.firstChild &&
+ doc.body.firstChild.nodeName.toLowerCase() === 'xml') {
+ return doc.body.firstChild as Element;
+ }
+
+ throw new Error(`DOMParser was unable to parse: ${text}`);
}
/**
@@ -124,18 +165,30 @@ export function textToDom(text: string): Element {
* @throws if XML doesn't parse.
*/
export function textToDomDocument(text: string): Document {
- const oParser = new DOMParser();
- return oParser.parseFromString(text, 'text/xml');
+ deprecation.warn(
+ 'Blockly.utils.xml.textToDomDocument', 'version 10', 'version 11');
+ return domParser.parseFromString(text, 'text/xml');
}
/**
* Converts a DOM structure into plain text.
* Currently the text format is fairly ugly: all one line with no whitespace.
*
+ * Control characters are escaped using their decimal encodings. This includes
+ * U+0000 even though it is technically never a valid XML character (even in
+ * XML 1.1).
+ * https://www.w3.org/TR/xml11/#charsets
+ *
+ * When decoded U+0000 will be parsed as U+FFFD (the "replacement character").
+ *
* @param dom A tree of XML nodes.
* @returns Text representation.
*/
export function domToText(dom: Node): string {
- const oSerializer = new XMLSerializer();
- return oSerializer.serializeToString(dom);
+ return sanitizeText(xmlSerializer.serializeToString(dom));
+}
+
+function sanitizeText(text: string) {
+ return text.replace(
+ INVALID_CONTROL_CHARS, (match) => `&#${match.charCodeAt(0)};`);
}
diff --git a/tests/mocha/serializer_test.js b/tests/mocha/serializer_test.js
index d45cd40c3..02a5fc410 100644
--- a/tests/mocha/serializer_test.js
+++ b/tests/mocha/serializer_test.js
@@ -370,7 +370,7 @@ Serializer.Fields.LabelSerializable.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Fields.LabelSerializable.testCases = [
@@ -416,7 +416,7 @@ Serializer.Fields.MultilineInput.Tabs = new SerializerTestCase(
'' +
'' +
'' +
- 'line1 line2 line3' +
+ 'line1 	line2 	line3' +
'' +
'' +
'');
@@ -487,7 +487,7 @@ Serializer.Fields.MultilineInput.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Fields.MultilineInput.testCases = [
@@ -588,7 +588,7 @@ Serializer.Fields.TextInput.Simple = new SerializerTestCase('Simple',
Serializer.Fields.TextInput.Tabs = new SerializerTestCase('Tabs',
'' +
'' +
- 'line1 line2 line3' +
+ 'line1	line2	line3' +
'' +
'');
/* eslint-enable no-tabs */
@@ -658,7 +658,7 @@ Serializer.Fields.TextInput.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Fields.TextInput.testCases = [
@@ -708,10 +708,10 @@ Serializer.Fields.Variable.Types = new SerializerTestCase('Types',
Serializer.Fields.Variable.Tabs = new SerializerTestCase('Tabs',
'' +
'' +
- 'line1 line2 line3' +
+ 'line1	line2	line3' +
'' +
'' +
- 'line1 line2 line3' +
+ 'line1	line2	line3' +
'' +
'');
/* eslint-enable no-tabs */
@@ -808,10 +808,10 @@ Serializer.Fields.Variable.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Fields.Variable.testCases = [
@@ -1047,7 +1047,7 @@ Serializer.Icons.Comment.Text.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Icons.Comment.Text.testCases = [
@@ -1804,7 +1804,7 @@ Serializer.Mutations.Procedure.Names.ControlChars = new SerializerTestCase(
'ControlChars',
'' +
'' +
- '¡' +
+ 'a1;' +
'' +
'');
Serializer.Mutations.Procedure.Names.testCases = [
diff --git a/tests/mocha/xml_test.js b/tests/mocha/xml_test.js
index df794bd8a..222eb0393 100644
--- a/tests/mocha/xml_test.js
+++ b/tests/mocha/xml_test.js
@@ -33,6 +33,9 @@ suite('XML', function() {
chai.assert.equal(fieldDom.getAttribute('id'), id);
chai.assert.equal(fieldDom.textContent, text);
};
+ const assertXmlDoc = function(doc) {
+ chai.assert.equal(doc.nodeName.toLowerCase(), 'xml', 'XML tag');
+ };
setup(function() {
sharedTestSetup.call(this);
Blockly.defineBlocksWithJsonArray([
@@ -73,13 +76,40 @@ suite('XML', function() {
teardown(function() {
sharedTestTeardown.call(this);
});
+
suite('textToDom', function() {
test('Basic', function() {
const dom = Blockly.utils.xml.textToDom(this.complexXmlText);
- chai.assert.equal(dom.nodeName, 'xml', 'XML tag');
- chai.assert.equal(dom.getElementsByTagName('block').length, 6, 'Block tags');
+ assertXmlDoc(dom);
+ chai.assert.equal(
+ dom.getElementsByTagName('block').length, 6, 'Block tags');
+ });
+
+ test(
+ 'text with hex-encoded NCR Control characters are properly ' +
+ 'deserialized',
+ function() {
+ const dom = Blockly.utils.xml.textToDom(' ');
+ assertXmlDoc(dom);
+ chai.assert.equal(dom.firstChild.textContent, '\u0001\t\u001f');
+ });
+
+ test(
+ 'text with dec-encoded NCR Control characters are properly ' +
+ 'deserialized',
+ function() {
+ const dom = Blockly.utils.xml.textToDom(' ');
+ assertXmlDoc(dom);
+ chai.assert.equal(dom.firstChild.textContent, '\u0001\u0009\u001f');
+ });
+
+ test('text with an escaped ampersand is properly deserialized', function() {
+ const dom = Blockly.utils.xml.textToDom('&');
+ assertXmlDoc(dom);
+ chai.assert.equal(dom.firstChild.textContent, '&');
});
});
+
suite('blockToDom', function() {
setup(function() {
this.workspace = new Blockly.Workspace();
@@ -433,6 +463,7 @@ suite('XML', function() {
chai.assert.equal(resultDom.children.length, 0);
});
});
+
suite('domToText', function() {
test('Round tripping', function() {
const dom = Blockly.utils.xml.textToDom(this.complexXmlText);
@@ -440,7 +471,26 @@ suite('XML', function() {
chai.assert.equal(text.replace(/\s+/g, ''),
this.complexXmlText.replace(/\s+/g, ''), 'Round trip');
});
+
+ test('control characters are escaped', function() {
+ const dom = Blockly.utils.xml.createElement('xml');
+ dom.appendChild(Blockly.utils.xml.createTextNode('')); // u0001
+ chai.assert.equal(
+ Blockly.utils.xml.domToText(dom),
+ ''
+ );
+ });
+
+ test('ampersands are escaped', function() {
+ const dom = Blockly.utils.xml.createElement('xml');
+ dom.appendChild(Blockly.utils.xml.createTextNode('&'));
+ chai.assert.equal(
+ Blockly.Xml.domToText(dom),
+ '&'
+ );
+ });
});
+
suite('domToPrettyText', function() {
test('Round tripping', function() {
const dom = Blockly.utils.xml.textToDom(this.complexXmlText);