diff --git a/CHANGES.rst b/CHANGES.rst
index ac08ae1f..2f691352 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -36,6 +36,10 @@ Released on May 17, 2013
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
return the default DOM treebuilder, which uses ``xml.dom.minidom``.
+* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
+ ``treeadapters.sax.to_sax`` which is generic and supports any
+ treewalker; it also resolves all known bugs with ``dom2sax``.
+
* Optional heuristic character encoding detection now based on
``charade`` for Python 2.6 - 3.3 compatibility.
diff --git a/html5lib/constants.py b/html5lib/constants.py
index 1866dd78..e7089846 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -433,6 +433,24 @@
(namespaces["mathml"], "mtext")
))
+adjustForeignAttributes = {
+ "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
+ "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
+ "xlink:href": ("xlink", "href", namespaces["xlink"]),
+ "xlink:role": ("xlink", "role", namespaces["xlink"]),
+ "xlink:show": ("xlink", "show", namespaces["xlink"]),
+ "xlink:title": ("xlink", "title", namespaces["xlink"]),
+ "xlink:type": ("xlink", "type", namespaces["xlink"]),
+ "xml:base": ("xml", "base", namespaces["xml"]),
+ "xml:lang": ("xml", "lang", namespaces["xml"]),
+ "xml:space": ("xml", "space", namespaces["xml"]),
+ "xmlns": (None, "xmlns", namespaces["xmlns"]),
+ "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
+}
+
+unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
+ adjustForeignAttributes.items()])
+
spaceCharacters = frozenset((
"\t",
"\n",
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
index 22c2b75c..0518c410 100644
--- a/html5lib/html5parser.py
+++ b/html5lib/html5parser.py
@@ -17,6 +17,7 @@
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
+from .constants import adjustForeignAttributes as adjustForeignAttributesMap
def parse(doc, treebuilder="etree", encoding=None,
@@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
del token["data"][originalName]
def adjustForeignAttributes(self, token):
- replacements = {
- "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
- "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
- "xlink:href": ("xlink", "href", namespaces["xlink"]),
- "xlink:role": ("xlink", "role", namespaces["xlink"]),
- "xlink:show": ("xlink", "show", namespaces["xlink"]),
- "xlink:title": ("xlink", "title", namespaces["xlink"]),
- "xlink:type": ("xlink", "type", namespaces["xlink"]),
- "xml:base": ("xml", "base", namespaces["xml"]),
- "xml:lang": ("xml", "lang", namespaces["xml"]),
- "xml:space": ("xml", "space", namespaces["xml"]),
- "xmlns": (None, "xmlns", namespaces["xmlns"]),
- "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
- }
+ replacements = adjustForeignAttributesMap
for originalName in token["data"].keys():
if originalName in replacements:
diff --git a/html5lib/tests/support.py b/html5lib/tests/support.py
index ebe90c2c..41f2d2a0 100644
--- a/html5lib/tests/support.py
+++ b/html5lib/tests/support.py
@@ -4,6 +4,7 @@
import sys
import codecs
import glob
+import xml.sax.handler
base_path = os.path.split(__file__)[0]
@@ -130,3 +131,47 @@ def errorMessage(input, expected, actual):
if sys.version_info.major == 2:
msg = msg.encode("ascii", "backslashreplace")
return msg
+
+
+class TracingSaxHandler(xml.sax.handler.ContentHandler):
+ def __init__(self):
+ xml.sax.handler.ContentHandler.__init__(self)
+ self.visited = []
+
+ def startDocument(self):
+ self.visited.append('startDocument')
+
+ def endDocument(self):
+ self.visited.append('endDocument')
+
+ def startPrefixMapping(self, prefix, uri):
+ # These are ignored as their order is not guaranteed
+ pass
+
+ def endPrefixMapping(self, prefix):
+ # These are ignored as their order is not guaranteed
+ pass
+
+ def startElement(self, name, attrs):
+ self.visited.append(('startElement', name, attrs))
+
+ def endElement(self, name):
+ self.visited.append(('endElement', name))
+
+ def startElementNS(self, name, qname, attrs):
+ self.visited.append(('startElementNS', name, qname, dict(attrs)))
+
+ def endElementNS(self, name, qname):
+ self.visited.append(('endElementNS', name, qname))
+
+ def characters(self, content):
+ self.visited.append(('characters', content))
+
+ def ignorableWhitespace(self, whitespace):
+ self.visited.append(('ignorableWhitespace', whitespace))
+
+ def processingInstruction(self, target, data):
+ self.visited.append(('processingInstruction', target, data))
+
+ def skippedEntity(self, name):
+ self.visited.append(('skippedEntity', name))
diff --git a/html5lib/tests/test_treeadapters.py b/html5lib/tests/test_treeadapters.py
new file mode 100644
index 00000000..5f38b6c3
--- /dev/null
+++ b/html5lib/tests/test_treeadapters.py
@@ -0,0 +1,40 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from . import support # flake8: noqa
+
+import html5lib
+from html5lib.treeadapters import sax
+from html5lib.treewalkers import getTreeWalker
+
+
+def test_to_sax():
+ handler = support.TracingSaxHandler()
+ tree = html5lib.parse("""
+
Directory Listing
+
+ """, treebuilder="etree")
+ walker = getTreeWalker("etree")
+ sax.to_sax(walker(tree), handler)
+ expected = [
+ 'startDocument',
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'html'),
+ 'html', {(None, 'xml:lang'): 'en'}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head', {}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title', {}),
+ ('characters', 'Directory Listing'),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'title'), 'title'),
+ ('characters', '\n '),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'head'), 'head'),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body', {}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a', {(None, 'href'): '/'}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b', {}),
+ ('startElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p', {}),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'p'), 'p'),
+ ('characters', '\n '),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'b'), 'b'),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'a'), 'a'),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'body'), 'body'),
+ ('endElementNS', ('http://www.w3.org/1999/xhtml', 'html'), 'html'),
+ 'endDocument',
+ ]
+ assert expected == handler.visited
diff --git a/html5lib/treeadapters/__init__.py b/html5lib/treeadapters/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/html5lib/treeadapters/sax.py b/html5lib/treeadapters/sax.py
new file mode 100644
index 00000000..ad47df95
--- /dev/null
+++ b/html5lib/treeadapters/sax.py
@@ -0,0 +1,44 @@
+from __future__ import absolute_import, division, unicode_literals
+
+from xml.sax.xmlreader import AttributesNSImpl
+
+from ..constants import adjustForeignAttributes, unadjustForeignAttributes
+
+prefix_mapping = {}
+for prefix, localName, namespace in adjustForeignAttributes.values():
+ if prefix is not None:
+ prefix_mapping[prefix] = namespace
+
+
+def to_sax(walker, handler):
+ """Call SAX-like content handler based on treewalker walker"""
+ handler.startDocument()
+ for prefix, namespace in prefix_mapping.items():
+ handler.startPrefixMapping(prefix, namespace)
+
+ for token in walker:
+ type = token["type"]
+ if type == "Doctype":
+ continue
+ elif type in ("StartTag", "EmptyTag"):
+ attrs = AttributesNSImpl(token["data"],
+ unadjustForeignAttributes)
+ handler.startElementNS((token["namespace"], token["name"]),
+ token["name"],
+ attrs)
+ if type == "EmptyTag":
+ handler.endElementNS((token["namespace"], token["name"]),
+ token["name"])
+ elif type == "EndTag":
+ handler.endElementNS((token["namespace"], token["name"]),
+ token["name"])
+ elif type in ("Characters", "SpaceCharacters"):
+ handler.characters(token["data"])
+ elif type == "Comment":
+ pass
+ else:
+ assert False, "Unknown token type"
+
+ for prefix, namespace in prefix_mapping.items():
+ handler.endPrefixMapping(prefix)
+ handler.endDocument()
diff --git a/html5lib/treebuilders/dom.py b/html5lib/treebuilders/dom.py
index f9e0d76e..61e5ed79 100644
--- a/html5lib/treebuilders/dom.py
+++ b/html5lib/treebuilders/dom.py
@@ -1,7 +1,7 @@
from __future__ import absolute_import, division, unicode_literals
-from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
+from xml.dom import minidom, Node
import weakref
from . import _base
@@ -220,69 +220,6 @@ def serializeElement(element, indent=0):
return "\n".join(rv)
- def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
- if node.nodeType == Node.ELEMENT_NODE:
- if not nsmap:
- handler.startElement(node.nodeName, node.attributes)
- for child in node.childNodes:
- dom2sax(child, handler, nsmap)
- handler.endElement(node.nodeName)
- else:
- attributes = dict(node.attributes.itemsNS())
-
- # gather namespace declarations
- prefixes = []
- for attrname in list(node.attributes.keys()):
- attr = node.getAttributeNode(attrname)
- if (attr.namespaceURI == XMLNS_NAMESPACE or
- (attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
- prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
- handler.startPrefixMapping(prefix, attr.nodeValue)
- prefixes.append(prefix)
- nsmap = nsmap.copy()
- nsmap[prefix] = attr.nodeValue
- del attributes[(attr.namespaceURI, attr.nodeName)]
-
- # apply namespace declarations
- for attrname in list(node.attributes.keys()):
- attr = node.getAttributeNode(attrname)
- if attr.namespaceURI is None and ':' in attr.nodeName:
- prefix = attr.nodeName.split(':')[0]
- if prefix in nsmap:
- del attributes[(attr.namespaceURI, attr.nodeName)]
- attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue
-
- # SAX events
- ns = node.namespaceURI or nsmap.get(None, None)
- handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
- for child in node.childNodes:
- dom2sax(child, handler, nsmap)
- handler.endElementNS((ns, node.nodeName), node.nodeName)
- for prefix in prefixes:
- handler.endPrefixMapping(prefix)
-
- elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
- handler.characters(node.nodeValue)
-
- elif node.nodeType == Node.DOCUMENT_NODE:
- handler.startDocument()
- for child in node.childNodes:
- dom2sax(child, handler, nsmap)
- handler.endDocument()
-
- elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
- for child in node.childNodes:
- dom2sax(child, handler, nsmap)
-
- else:
- # ATTRIBUTE_NODE
- # ENTITY_NODE
- # PROCESSING_INSTRUCTION_NODE
- # COMMENT_NODE
- # DOCUMENT_TYPE_NODE
- # NOTATION_NODE
- pass
-
return locals()