From c8e024b8ae8712baa79872ac0aed6d0b82e06fa2 Mon Sep 17 00:00:00 2001 From: Marc DM Date: Sun, 16 Jun 2013 18:15:56 -0500 Subject: [PATCH 1/6] fixed the unicode/str/six bug, but the test that proves it is in an alien file --- html5lib/serializer/htmlserializer.py | 6 ++-- html5lib/tests/test_six_encoding.py | 17 +++++++++ html5lib/treewalkers/_base.py | 50 +++++++++++++-------------- 3 files changed, 45 insertions(+), 28 deletions(-) create mode 100644 html5lib/tests/test_six_encoding.py diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 594b5161..6f218e67 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, string_types import gettext _ = gettext.gettext @@ -154,14 +154,14 @@ def __init__(self, **kwargs): self.strict = False def encode(self, string): - assert(isinstance(string, text_type)) + assert(isinstance(string, string_types)) if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: return string def encodeStrict(self, string): - assert(isinstance(string, text_type)) + assert(isinstance(string, string_types)) if self.encoding: return string.encode(self.encoding, "strict") else: diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py new file mode 100644 index 00000000..1cceb551 --- /dev/null +++ b/html5lib/tests/test_six_encoding.py @@ -0,0 +1,17 @@ + +from html5lib import html5parser, treewalkers, serializer +from nose.tools import eq_ + +def test_treewalker6(): + """Str/Unicode mix. If str attrs added to tree""" + + text = 'Example' + end_text = 'Example' + parser = html5parser.HTMLParser() + walker = treewalkers.getTreeWalker('etree') + serializr = serializer.HTMLSerializer(quote_attr_values=True) + domtree = parser.parseFragment(text) + + # at this point domtree should be a DOCUMENT_FRAGMENT + domtree[0].set('class', 'test123') + eq_(end_text, serializr.render(walker(domtree))) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 48b6da48..f8418462 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type +from six import text_type, string_types import gettext _ = gettext.gettext @@ -19,43 +19,43 @@ def error(self, msg): return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): - assert namespace is None or isinstance(namespace, text_type), type(namespace) - assert isinstance(name, text_type), type(name) - assert all((namespace is None or isinstance(namespace, text_type)) and - isinstance(name, text_type) and - isinstance(value, text_type) + assert namespace is None or isinstance(namespace, string_types), type(namespace) + assert isinstance(name, string_types), type(name) + assert all((namespace is None or isinstance(namespace, string_types)) and + isinstance(name, string_types) and + isinstance(value, string_types) for (namespace, name), value in attrs.items()) - yield {"type": "EmptyTag", "name": name, - "namespace": namespace, + yield {"type": "EmptyTag", "name": text_type(name), + "namespace": text_type(namespace), "data": attrs} if hasChildren: yield self.error(_("Void element has children")) def startTag(self, namespace, name, attrs): - assert namespace is None or isinstance(namespace, text_type), type(namespace) - assert isinstance(name, text_type), type(name) - assert all((namespace is None or isinstance(namespace, text_type)) and - isinstance(name, text_type) and - isinstance(value, text_type) + assert namespace is None or isinstance(namespace, string_types), type(namespace) + assert isinstance(name, string_types), type(name) + assert all((namespace is None or isinstance(namespace, string_types)) and + isinstance(name, string_types) and + isinstance(value, string_types) for (namespace, name), value in attrs.items()) return {"type": "StartTag", - "name": name, - "namespace": namespace, + "name": text_type(name), + "namespace": text_type(namespace), "data": attrs} def endTag(self, namespace, name): - assert namespace is None or isinstance(namespace, text_type), type(namespace) - assert isinstance(name, text_type), type(namespace) + assert namespace is None or isinstance(namespace, string_types), type(namespace) + assert isinstance(name, string_types), type(namespace) return {"type": "EndTag", - "name": name, - "namespace": namespace, + "name": text_type(name), + "namespace": text_type(namespace), "data": {}} def text(self, data): - assert isinstance(data, text_type), type(data) + assert isinstance(data, string_types), type(data) data = data middle = data.lstrip(spaceCharacters) @@ -71,14 +71,14 @@ def text(self, data): yield {"type": "SpaceCharacters", "data": right} def comment(self, data): - assert isinstance(data, text_type), type(data) + assert isinstance(data, string_types), type(data) return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None, correct=True): - assert name is None or isinstance(name, text_type), type(name) - assert publicId is None or isinstance(publicId, text_type), type(publicId) - assert systemId is None or isinstance(systemId, text_type), type(systemId) + assert name is None or isinstance(name, string_types), type(name) + assert publicId is None or isinstance(publicId, string_types), type(publicId) + assert systemId is None or isinstance(systemId, string_types), type(systemId) return {"type": "Doctype", "name": name if name is not None else "", @@ -87,7 +87,7 @@ def doctype(self, name, publicId=None, systemId=None, correct=True): "correct": correct} def entity(self, name): - assert isinstance(name, text_type), type(name) + assert isinstance(name, string_types), type(name) return {"type": "Entity", "name": name} From de312430310886bd74ef6ce39b7f037e26bc19d5 Mon Sep 17 00:00:00 2001 From: Marc DM Date: Sun, 16 Jun 2013 19:18:11 -0500 Subject: [PATCH 2/6] tweaks so that Travis doesn't complain --- html5lib/serializer/htmlserializer.py | 2 +- html5lib/tests/test_six_encoding.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 6f218e67..4584fdd0 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import text_type, string_types +from six import string_types import gettext _ = gettext.gettext diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py index 1cceb551..de6f934e 100644 --- a/html5lib/tests/test_six_encoding.py +++ b/html5lib/tests/test_six_encoding.py @@ -2,9 +2,10 @@ from html5lib import html5parser, treewalkers, serializer from nose.tools import eq_ + def test_treewalker6(): """Str/Unicode mix. If str attrs added to tree""" - + text = 'Example' end_text = 'Example' parser = html5parser.HTMLParser() From 07bbab1a3f4b005d60e0cd0b514b8cc6c646a530 Mon Sep 17 00:00:00 2001 From: Marc DM Date: Tue, 18 Jun 2013 18:15:03 -0500 Subject: [PATCH 3/6] good good, all tests pass except the one that needs to go --- html5lib/serializer/htmlserializer.py | 6 ++-- html5lib/tests/test_six_encoding.py | 16 +++++---- html5lib/tests/test_treewalkers.py | 38 ++++++++++++++++++++ html5lib/treewalkers/_base.py | 52 ++++++++++++++++++--------- 4 files changed, 86 insertions(+), 26 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 4584fdd0..594b5161 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -1,5 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import string_types +from six import text_type import gettext _ = gettext.gettext @@ -154,14 +154,14 @@ def __init__(self, **kwargs): self.strict = False def encode(self, string): - assert(isinstance(string, string_types)) + assert(isinstance(string, text_type)) if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: return string def encodeStrict(self, string): - assert(isinstance(string, string_types)) + assert(isinstance(string, text_type)) if self.encoding: return string.encode(self.encoding, "strict") else: diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py index de6f934e..39265ef1 100644 --- a/html5lib/tests/test_six_encoding.py +++ b/html5lib/tests/test_six_encoding.py @@ -1,18 +1,20 @@ -from html5lib import html5parser, treewalkers, serializer -from nose.tools import eq_ +from html5lib import html5parser, treewalkers, treebuilders, serializer -def test_treewalker6(): +def test_treewalker_six_mix(): """Str/Unicode mix. If str attrs added to tree""" text = 'Example' - end_text = 'Example' - parser = html5parser.HTMLParser() - walker = treewalkers.getTreeWalker('etree') + end_texts = ('Example', + 'Example') + parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom')) + walker = treewalkers.getTreeWalker('dom') serializr = serializer.HTMLSerializer(quote_attr_values=True) domtree = parser.parseFragment(text) # at this point domtree should be a DOCUMENT_FRAGMENT domtree[0].set('class', 'test123') - eq_(end_text, serializr.render(walker(domtree))) + out = serializr.render(walker(domtree)) + if not out in end_texts: + raise AssertionError('%r not in %r' % (out, end_texts)) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 549a4747..737e5140 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -310,3 +310,41 @@ def test_treewalker(): "document")] errors = errors.split("\n") yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls + + +def set_attribute_on_first_child(docfrag, name, value, treeName): + """naively sets an attribute on the first child of the document + fragment passed in""" + setter = {'ElementTree': lambda d: d[0].set, + 'DOM': lambda d: d.firstChild.setAttribute} + setter['PullDOM'] = setter['DOM'] + try: + setter.get(treeName, setter['ElementTree'])(docfrag)(name, value) + except TypeError: + setter['DOM'](docfrag)(name, value) + + +def runTreewalkerEditTest(intext, expected, attrs_to_add, tree): + """tests what happens when we add attributes to the intext""" + treeName, treeClass = tree + parser = html5parser.HTMLParser(tree=treeClass["builder"]) + document = parser.parseFragment(intext) + for nom, val in attrs_to_add: + set_attribute_on_first_child(document, nom, val, treeName) + + document = treeClass.get("adapter", lambda x: x)(document) + output = convertTokens(treeClass["walker"](document)) + output = attrlist.sub(sortattrs, output) + if not output in expected: + raise AssertionError('%r not in %r' % (output, expected)) + + +def test_treewalker_six_mix(): + """Str/Unicode mix. If str attrs added to tree""" + + intext = 'Example' + expected = '\n class="test123"\n href="http://example.com"\n "Example"' + attrs = [('class', 'test123')] + + for tree in treeTypes.items(): + yield runTreewalkerEditTest, intext, expected, attrs, tree diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index f8418462..3a61ad19 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -8,6 +8,24 @@ spaceCharacters = "".join(spaceCharacters) +def to_text(s, blank_if_none=True): + """Wrapper around six.text_type to convert None to empty string""" + if s is None: + if blank_if_none: + return "" + else: + return None + elif isinstance(s, text_type): + return s + else: + return text_type(s) + + +def is_text_or_none(string): + """Wrapper around isinstance(string_types) or is None""" + return string is None or isinstance(string, string_types) + + class TreeWalker(object): def __init__(self, tree): self.tree = tree @@ -26,8 +44,8 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False): isinstance(value, string_types) for (namespace, name), value in attrs.items()) - yield {"type": "EmptyTag", "name": text_type(name), - "namespace": text_type(namespace), + yield {"type": "EmptyTag", "name": to_text(name, False), + "namespace": to_text(namespace), "data": attrs} if hasChildren: yield self.error(_("Void element has children")) @@ -42,22 +60,24 @@ def startTag(self, namespace, name, attrs): return {"type": "StartTag", "name": text_type(name), - "namespace": text_type(namespace), - "data": attrs} + "namespace": to_text(namespace), + "data": dict(((to_text(ns), to_text(na, False)), + to_text(va, False)) + for (ns, na), va in attrs.items())} def endTag(self, namespace, name): assert namespace is None or isinstance(namespace, string_types), type(namespace) assert isinstance(name, string_types), type(namespace) return {"type": "EndTag", - "name": text_type(name), - "namespace": text_type(namespace), + "name": to_text(name, False), + "namespace": to_text(namespace), "data": {}} def text(self, data): assert isinstance(data, string_types), type(data) - data = data + data = to_text(data) middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: @@ -73,23 +93,23 @@ def text(self, data): def comment(self, data): assert isinstance(data, string_types), type(data) - return {"type": "Comment", "data": data} + return {"type": "Comment", "data": text_type(data)} def doctype(self, name, publicId=None, systemId=None, correct=True): - assert name is None or isinstance(name, string_types), type(name) - assert publicId is None or isinstance(publicId, string_types), type(publicId) - assert systemId is None or isinstance(systemId, string_types), type(systemId) + assert is_text_or_none(name), type(name) + assert is_text_or_none(publicId), type(publicId) + assert is_text_or_none(systemId), type(systemId) return {"type": "Doctype", - "name": name if name is not None else "", - "publicId": publicId, - "systemId": systemId, - "correct": correct} + "name": to_text(name), + "publicId": to_text(publicId), + "systemId": to_text(systemId), + "correct": to_text(correct)} def entity(self, name): assert isinstance(name, string_types), type(name) - return {"type": "Entity", "name": name} + return {"type": "Entity", "name": text_type(name)} def unknown(self, nodeType): return self.error(_("Unknown node type: ") + nodeType) From b5c80859900f6f60d512a583c66a5bb9c1eb45fa Mon Sep 17 00:00:00 2001 From: Marc DM Date: Tue, 18 Jun 2013 19:01:58 -0500 Subject: [PATCH 4/6] lots of lessons learned, hopefully the test is more useful now --- html5lib/tests/test_six_encoding.py | 20 -------------------- html5lib/tests/test_treewalkers.py | 26 +++++++++++++++++--------- 2 files changed, 17 insertions(+), 29 deletions(-) delete mode 100644 html5lib/tests/test_six_encoding.py diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py deleted file mode 100644 index 39265ef1..00000000 --- a/html5lib/tests/test_six_encoding.py +++ /dev/null @@ -1,20 +0,0 @@ - -from html5lib import html5parser, treewalkers, treebuilders, serializer - - -def test_treewalker_six_mix(): - """Str/Unicode mix. If str attrs added to tree""" - - text = 'Example' - end_texts = ('Example', - 'Example') - parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom')) - walker = treewalkers.getTreeWalker('dom') - serializr = serializer.HTMLSerializer(quote_attr_values=True) - domtree = parser.parseFragment(text) - - # at this point domtree should be a DOCUMENT_FRAGMENT - domtree[0].set('class', 'test123') - out = serializr.render(walker(domtree)) - if not out in end_texts: - raise AssertionError('%r not in %r' % (out, end_texts)) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index 737e5140..e95ae694 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -317,11 +317,11 @@ def set_attribute_on_first_child(docfrag, name, value, treeName): fragment passed in""" setter = {'ElementTree': lambda d: d[0].set, 'DOM': lambda d: d.firstChild.setAttribute} - setter['PullDOM'] = setter['DOM'] + setter['cElementTree'] = setter['ElementTree'] try: - setter.get(treeName, setter['ElementTree'])(docfrag)(name, value) - except TypeError: - setter['DOM'](docfrag)(name, value) + setter.get(treeName, setter['DOM'])(docfrag)(name, value) + except AttributeError: + setter['ElementTree'](docfrag)(name, value) def runTreewalkerEditTest(intext, expected, attrs_to_add, tree): @@ -336,15 +336,23 @@ def runTreewalkerEditTest(intext, expected, attrs_to_add, tree): output = convertTokens(treeClass["walker"](document)) output = attrlist.sub(sortattrs, output) if not output in expected: - raise AssertionError('%r not in %r' % (output, expected)) + raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output)) def test_treewalker_six_mix(): """Str/Unicode mix. If str attrs added to tree""" - intext = 'Example' - expected = '\n class="test123"\n href="http://example.com"\n "Example"' - attrs = [('class', 'test123')] + # ToDo: Find a better way to specify that the attribute value is a bytestring + sm_tests = [ + ('Example', + [(str('class'), str('test123'))], + '\n class="test123"\n href="http://example.com"\n "Example"'), + + ('', + [(str('rel'), str('alternate'))], + '\n href="http://example.com/cow"\n rel="alternate"\n "Example"') + ] for tree in treeTypes.items(): - yield runTreewalkerEditTest, intext, expected, attrs, tree + for intext, attrs, expected in sm_tests: + yield runTreewalkerEditTest, intext, expected, attrs, tree From 4a7adf281e0a670cb124dda03b7391a8974837bf Mon Sep 17 00:00:00 2001 From: Marc DM Date: Sat, 22 Jun 2013 22:31:02 -0500 Subject: [PATCH 5/6] minor update for clarity --- html5lib/treewalkers/_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py index 3a61ad19..bd0e0c58 100644 --- a/html5lib/treewalkers/_base.py +++ b/html5lib/treewalkers/_base.py @@ -61,9 +61,9 @@ def startTag(self, namespace, name, attrs): return {"type": "StartTag", "name": text_type(name), "namespace": to_text(namespace), - "data": dict(((to_text(ns), to_text(na, False)), - to_text(va, False)) - for (ns, na), va in attrs.items())} + "data": dict(((to_text(namespace, False), to_text(name)), + to_text(value, False)) + for (namespace, name), value in attrs.items())} def endTag(self, namespace, name): assert namespace is None or isinstance(namespace, string_types), type(namespace) From 841e67c902156af640ce9f7eabb6ac1ff89afe22 Mon Sep 17 00:00:00 2001 From: Marc DM Date: Sat, 22 Jun 2013 23:08:51 -0500 Subject: [PATCH 6/6] updated test comment --- html5lib/tests/test_treewalkers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py index e95ae694..8c120163 100644 --- a/html5lib/tests/test_treewalkers.py +++ b/html5lib/tests/test_treewalkers.py @@ -342,7 +342,12 @@ def runTreewalkerEditTest(intext, expected, attrs_to_add, tree): def test_treewalker_six_mix(): """Str/Unicode mix. If str attrs added to tree""" - # ToDo: Find a better way to specify that the attribute value is a bytestring + # On Python 2.x string literals are of type str. Unless, like this + # file, the programmer imports unicode_literals from __future__. + # In that case, string literals become objects of type unicode. + + # This test simulates a Py2 user, modifying attributes on a document + # fragment but not using the u'' syntax nor importing unicode_literals sm_tests = [ ('Example', [(str('class'), str('test123'))],