From c8e024b8ae8712baa79872ac0aed6d0b82e06fa2 Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Sun, 16 Jun 2013 18:15:56 -0500
Subject: [PATCH 1/6] fixed the unicode/str/six bug, but the test that proves
 it is in an alien file

---
 html5lib/serializer/htmlserializer.py |  6 ++--
 html5lib/tests/test_six_encoding.py   | 17 +++++++++
 html5lib/treewalkers/_base.py         | 50 +++++++++++++--------------
 3 files changed, 45 insertions(+), 28 deletions(-)
 create mode 100644 html5lib/tests/test_six_encoding.py
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index 594b5161..6f218e67 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
+from six import text_type, string_types
 
 import gettext
 _ = gettext.gettext
@@ -154,14 +154,14 @@ def __init__(self, **kwargs):
         self.strict = False
 
     def encode(self, string):
-        assert(isinstance(string, text_type))
+        assert(isinstance(string, string_types))
         if self.encoding:
             return string.encode(self.encoding, unicode_encode_errors)
         else:
             return string
 
     def encodeStrict(self, string):
-        assert(isinstance(string, text_type))
+        assert(isinstance(string, string_types))
         if self.encoding:
             return string.encode(self.encoding, "strict")
         else:
diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py
new file mode 100644
index 00000000..1cceb551
--- /dev/null
+++ b/html5lib/tests/test_six_encoding.py
@@ -0,0 +1,17 @@
+
+from html5lib import html5parser, treewalkers, serializer
+from nose.tools import eq_
+
+def test_treewalker6():
+    """Str/Unicode mix. If str attrs added to tree"""
+    
+    text = '<a href="http://example.com">Example</a>'
+    end_text = '<a href="http://example.com" class="test123">Example</a>'
+    parser = html5parser.HTMLParser()
+    walker = treewalkers.getTreeWalker('etree')
+    serializr = serializer.HTMLSerializer(quote_attr_values=True)
+    domtree = parser.parseFragment(text)
+
+    # at this point domtree should be a DOCUMENT_FRAGMENT
+    domtree[0].set('class', 'test123')
+    eq_(end_text, serializr.render(walker(domtree)))
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index 48b6da48..f8418462 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type
+from six import text_type, string_types
 
 import gettext
 _ = gettext.gettext
@@ -19,43 +19,43 @@ def error(self, msg):
         return {"type": "SerializeError", "data": msg}
 
     def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        assert namespace is None or isinstance(namespace, text_type), type(namespace)
-        assert isinstance(name, text_type), type(name)
-        assert all((namespace is None or isinstance(namespace, text_type)) and
-                   isinstance(name, text_type) and
-                   isinstance(value, text_type)
+        assert namespace is None or isinstance(namespace, string_types), type(namespace)
+        assert isinstance(name, string_types), type(name)
+        assert all((namespace is None or isinstance(namespace, string_types)) and
+                   isinstance(name, string_types) and
+                   isinstance(value, string_types)
                    for (namespace, name), value in attrs.items())
 
-        yield {"type": "EmptyTag", "name": name,
-               "namespace": namespace,
+        yield {"type": "EmptyTag", "name": text_type(name),
+               "namespace": text_type(namespace),
                "data": attrs}
         if hasChildren:
             yield self.error(_("Void element has children"))
 
     def startTag(self, namespace, name, attrs):
-        assert namespace is None or isinstance(namespace, text_type), type(namespace)
-        assert isinstance(name, text_type), type(name)
-        assert all((namespace is None or isinstance(namespace, text_type)) and
-                   isinstance(name, text_type) and
-                   isinstance(value, text_type)
+        assert namespace is None or isinstance(namespace, string_types), type(namespace)
+        assert isinstance(name, string_types), type(name)
+        assert all((namespace is None or isinstance(namespace, string_types)) and
+                   isinstance(name, string_types) and
+                   isinstance(value, string_types)
                    for (namespace, name), value in attrs.items())
 
         return {"type": "StartTag",
-                "name": name,
-                "namespace": namespace,
+                "name": text_type(name),
+                "namespace": text_type(namespace),
                 "data": attrs}
 
     def endTag(self, namespace, name):
-        assert namespace is None or isinstance(namespace, text_type), type(namespace)
-        assert isinstance(name, text_type), type(namespace)
+        assert namespace is None or isinstance(namespace, string_types), type(namespace)
+        assert isinstance(name, string_types), type(namespace)
 
         return {"type": "EndTag",
-                "name": name,
-                "namespace": namespace,
+                "name": text_type(name),
+                "namespace": text_type(namespace),
                 "data": {}}
 
     def text(self, data):
-        assert isinstance(data, text_type), type(data)
+        assert isinstance(data, string_types), type(data)
 
         data = data
         middle = data.lstrip(spaceCharacters)
@@ -71,14 +71,14 @@ def text(self, data):
             yield {"type": "SpaceCharacters", "data": right}
 
     def comment(self, data):
-        assert isinstance(data, text_type), type(data)
+        assert isinstance(data, string_types), type(data)
 
         return {"type": "Comment", "data": data}
 
     def doctype(self, name, publicId=None, systemId=None, correct=True):
-        assert name is None or isinstance(name, text_type), type(name)
-        assert publicId is None or isinstance(publicId, text_type), type(publicId)
-        assert systemId is None or isinstance(systemId, text_type), type(systemId)
+        assert name is None or isinstance(name, string_types), type(name)
+        assert publicId is None or isinstance(publicId, string_types), type(publicId)
+        assert systemId is None or isinstance(systemId, string_types), type(systemId)
 
         return {"type": "Doctype",
                 "name": name if name is not None else "",
@@ -87,7 +87,7 @@ def doctype(self, name, publicId=None, systemId=None, correct=True):
                 "correct": correct}
 
     def entity(self, name):
-        assert isinstance(name, text_type), type(name)
+        assert isinstance(name, string_types), type(name)
 
         return {"type": "Entity", "name": name}
 

From de312430310886bd74ef6ce39b7f037e26bc19d5 Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Sun, 16 Jun 2013 19:18:11 -0500
Subject: [PATCH 2/6] tweaks so that Travis doesn't complain

---
 html5lib/serializer/htmlserializer.py | 2 +-
 html5lib/tests/test_six_encoding.py   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index 6f218e67..4584fdd0 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import text_type, string_types
+from six import string_types
 
 import gettext
 _ = gettext.gettext
diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py
index 1cceb551..de6f934e 100644
--- a/html5lib/tests/test_six_encoding.py
+++ b/html5lib/tests/test_six_encoding.py
@@ -2,9 +2,10 @@
 from html5lib import html5parser, treewalkers, serializer
 from nose.tools import eq_
 
+
 def test_treewalker6():
     """Str/Unicode mix. If str attrs added to tree"""
-    
+
     text = '<a href="http://example.com">Example</a>'
     end_text = '<a href="http://example.com" class="test123">Example</a>'
     parser = html5parser.HTMLParser()

From 07bbab1a3f4b005d60e0cd0b514b8cc6c646a530 Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Tue, 18 Jun 2013 18:15:03 -0500
Subject: [PATCH 3/6] good good, all tests pass except the one that needs to go

---
 html5lib/serializer/htmlserializer.py |  6 ++--
 html5lib/tests/test_six_encoding.py   | 16 +++++----
 html5lib/tests/test_treewalkers.py    | 38 ++++++++++++++++++++
 html5lib/treewalkers/_base.py         | 52 ++++++++++++++++++---------
 4 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index 4584fdd0..594b5161 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
-from six import string_types
+from six import text_type
 
 import gettext
 _ = gettext.gettext
@@ -154,14 +154,14 @@ def __init__(self, **kwargs):
         self.strict = False
 
     def encode(self, string):
-        assert(isinstance(string, string_types))
+        assert(isinstance(string, text_type))
         if self.encoding:
             return string.encode(self.encoding, unicode_encode_errors)
         else:
             return string
 
     def encodeStrict(self, string):
-        assert(isinstance(string, string_types))
+        assert(isinstance(string, text_type))
         if self.encoding:
             return string.encode(self.encoding, "strict")
         else:
diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py
index de6f934e..39265ef1 100644
--- a/html5lib/tests/test_six_encoding.py
+++ b/html5lib/tests/test_six_encoding.py
@@ -1,18 +1,20 @@
 
-from html5lib import html5parser, treewalkers, serializer
-from nose.tools import eq_
+from html5lib import html5parser, treewalkers, treebuilders, serializer
 
 
-def test_treewalker6():
+def test_treewalker_six_mix():
     """Str/Unicode mix. If str attrs added to tree"""
 
     text = '<a href="http://example.com">Example</a>'
-    end_text = '<a href="http://example.com" class="test123">Example</a>'
-    parser = html5parser.HTMLParser()
-    walker = treewalkers.getTreeWalker('etree')
+    end_texts = ('<a href="http://example.com" class="test123">Example</a>',
+                 '<a class="test123" href="http://example.com">Example</a>')
+    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+    walker = treewalkers.getTreeWalker('dom')
     serializr = serializer.HTMLSerializer(quote_attr_values=True)
     domtree = parser.parseFragment(text)
 
     # at this point domtree should be a DOCUMENT_FRAGMENT
     domtree[0].set('class', 'test123')
-    eq_(end_text, serializr.render(walker(domtree)))
+    out = serializr.render(walker(domtree))
+    if not out in end_texts:
+        raise AssertionError('%r not in %r' % (out, end_texts))
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 549a4747..737e5140 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -310,3 +310,41 @@ def test_treewalker():
                                                                "document")]
                 errors = errors.split("\n")
                 yield runTreewalkerTest, innerHTML, input, expected, errors, treeCls
+
+
+def set_attribute_on_first_child(docfrag, name, value, treeName):
+    """naively sets an attribute on the first child of the document
+    fragment passed in"""
+    setter = {'ElementTree': lambda d: d[0].set,
+              'DOM': lambda d: d.firstChild.setAttribute}
+    setter['PullDOM'] = setter['DOM']
+    try:
+        setter.get(treeName, setter['ElementTree'])(docfrag)(name, value)
+    except TypeError:
+        setter['DOM'](docfrag)(name, value)
+
+
+def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
+    """tests what happens when we add attributes to the intext"""
+    treeName, treeClass = tree
+    parser = html5parser.HTMLParser(tree=treeClass["builder"])
+    document = parser.parseFragment(intext)
+    for nom, val in attrs_to_add:
+        set_attribute_on_first_child(document, nom, val, treeName)
+
+    document = treeClass.get("adapter", lambda x: x)(document)
+    output = convertTokens(treeClass["walker"](document))
+    output = attrlist.sub(sortattrs, output)
+    if not output in expected:
+        raise AssertionError('%r not in %r' % (output, expected))
+
+
+def test_treewalker_six_mix():
+    """Str/Unicode mix. If str attrs added to tree"""
+
+    intext = '<a href="http://example.com">Example</a>'
+    expected = '<a>\n  class="test123"\n  href="http://example.com"\n  "Example"'
+    attrs = [('class', 'test123')]
+
+    for tree in treeTypes.items():
+        yield runTreewalkerEditTest, intext, expected, attrs, tree
diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index f8418462..3a61ad19 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -8,6 +8,24 @@
 spaceCharacters = "".join(spaceCharacters)
 
 
+def to_text(s, blank_if_none=True):
+    """Wrapper around six.text_type to convert None to empty string"""
+    if s is None:
+        if blank_if_none:
+            return ""
+        else:
+            return None
+    elif isinstance(s, text_type):
+        return s
+    else:
+        return text_type(s)
+
+
+def is_text_or_none(string):
+    """Wrapper around isinstance(string_types) or is None"""
+    return string is None or isinstance(string, string_types)
+
+
 class TreeWalker(object):
     def __init__(self, tree):
         self.tree = tree
@@ -26,8 +44,8 @@ def emptyTag(self, namespace, name, attrs, hasChildren=False):
                    isinstance(value, string_types)
                    for (namespace, name), value in attrs.items())
 
-        yield {"type": "EmptyTag", "name": text_type(name),
-               "namespace": text_type(namespace),
+        yield {"type": "EmptyTag", "name": to_text(name, False),
+               "namespace": to_text(namespace),
                "data": attrs}
         if hasChildren:
             yield self.error(_("Void element has children"))
@@ -42,22 +60,24 @@ def startTag(self, namespace, name, attrs):
 
         return {"type": "StartTag",
                 "name": text_type(name),
-                "namespace": text_type(namespace),
-                "data": attrs}
+                "namespace": to_text(namespace),
+                "data": dict(((to_text(ns), to_text(na, False)),
+                              to_text(va, False))
+                             for (ns, na), va in attrs.items())}
 
     def endTag(self, namespace, name):
         assert namespace is None or isinstance(namespace, string_types), type(namespace)
         assert isinstance(name, string_types), type(namespace)
 
         return {"type": "EndTag",
-                "name": text_type(name),
-                "namespace": text_type(namespace),
+                "name": to_text(name, False),
+                "namespace": to_text(namespace),
                 "data": {}}
 
     def text(self, data):
         assert isinstance(data, string_types), type(data)
 
-        data = data
+        data = to_text(data)
         middle = data.lstrip(spaceCharacters)
         left = data[:len(data) - len(middle)]
         if left:
@@ -73,23 +93,23 @@ def text(self, data):
     def comment(self, data):
         assert isinstance(data, string_types), type(data)
 
-        return {"type": "Comment", "data": data}
+        return {"type": "Comment", "data": text_type(data)}
 
     def doctype(self, name, publicId=None, systemId=None, correct=True):
-        assert name is None or isinstance(name, string_types), type(name)
-        assert publicId is None or isinstance(publicId, string_types), type(publicId)
-        assert systemId is None or isinstance(systemId, string_types), type(systemId)
+        assert is_text_or_none(name), type(name)
+        assert is_text_or_none(publicId), type(publicId)
+        assert is_text_or_none(systemId), type(systemId)
 
         return {"type": "Doctype",
-                "name": name if name is not None else "",
-                "publicId": publicId,
-                "systemId": systemId,
-                "correct": correct}
+                "name": to_text(name),
+                "publicId": to_text(publicId),
+                "systemId": to_text(systemId),
+                "correct": to_text(correct)}
 
     def entity(self, name):
         assert isinstance(name, string_types), type(name)
 
-        return {"type": "Entity", "name": name}
+        return {"type": "Entity", "name": text_type(name)}
 
     def unknown(self, nodeType):
         return self.error(_("Unknown node type: ") + nodeType)

From b5c80859900f6f60d512a583c66a5bb9c1eb45fa Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Tue, 18 Jun 2013 19:01:58 -0500
Subject: [PATCH 4/6] lots of lessons learned, hopefully the test is more
 useful now

---
 html5lib/tests/test_six_encoding.py | 20 --------------------
 html5lib/tests/test_treewalkers.py  | 26 +++++++++++++++++---------
 2 files changed, 17 insertions(+), 29 deletions(-)
 delete mode 100644 html5lib/tests/test_six_encoding.py

diff --git a/html5lib/tests/test_six_encoding.py b/html5lib/tests/test_six_encoding.py
deleted file mode 100644
index 39265ef1..00000000
--- a/html5lib/tests/test_six_encoding.py
+++ /dev/null
@@ -1,20 +0,0 @@
-
-from html5lib import html5parser, treewalkers, treebuilders, serializer
-
-
-def test_treewalker_six_mix():
-    """Str/Unicode mix. If str attrs added to tree"""
-
-    text = '<a href="http://example.com">Example</a>'
-    end_texts = ('<a href="http://example.com" class="test123">Example</a>',
-                 '<a class="test123" href="http://example.com">Example</a>')
-    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
-    walker = treewalkers.getTreeWalker('dom')
-    serializr = serializer.HTMLSerializer(quote_attr_values=True)
-    domtree = parser.parseFragment(text)
-
-    # at this point domtree should be a DOCUMENT_FRAGMENT
-    domtree[0].set('class', 'test123')
-    out = serializr.render(walker(domtree))
-    if not out in end_texts:
-        raise AssertionError('%r not in %r' % (out, end_texts))
diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index 737e5140..e95ae694 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -317,11 +317,11 @@ def set_attribute_on_first_child(docfrag, name, value, treeName):
     fragment passed in"""
     setter = {'ElementTree': lambda d: d[0].set,
               'DOM': lambda d: d.firstChild.setAttribute}
-    setter['PullDOM'] = setter['DOM']
+    setter['cElementTree'] = setter['ElementTree']
     try:
-        setter.get(treeName, setter['ElementTree'])(docfrag)(name, value)
-    except TypeError:
-        setter['DOM'](docfrag)(name, value)
+        setter.get(treeName, setter['DOM'])(docfrag)(name, value)
+    except AttributeError:
+        setter['ElementTree'](docfrag)(name, value)
 
 
 def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
@@ -336,15 +336,23 @@ def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
     output = convertTokens(treeClass["walker"](document))
     output = attrlist.sub(sortattrs, output)
     if not output in expected:
-        raise AssertionError('%r not in %r' % (output, expected))
+        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
 
 
 def test_treewalker_six_mix():
     """Str/Unicode mix. If str attrs added to tree"""
 
-    intext = '<a href="http://example.com">Example</a>'
-    expected = '<a>\n  class="test123"\n  href="http://example.com"\n  "Example"'
-    attrs = [('class', 'test123')]
+    # ToDo: Find a better way to specify that the attribute value is a bytestring
+    sm_tests = [
+        ('<a href="http://example.com">Example</a>',
+         [(str('class'), str('test123'))],
+         '<a>\n  class="test123"\n  href="http://example.com"\n  "Example"'),
+
+        ('<link href="http://example.com/cow">',
+         [(str('rel'), str('alternate'))],
+         '<link>\n  href="http://example.com/cow"\n  rel="alternate"\n  "Example"')
+    ]
 
     for tree in treeTypes.items():
-        yield runTreewalkerEditTest, intext, expected, attrs, tree
+        for intext, attrs, expected in sm_tests:
+            yield runTreewalkerEditTest, intext, expected, attrs, tree

From 4a7adf281e0a670cb124dda03b7391a8974837bf Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Sat, 22 Jun 2013 22:31:02 -0500
Subject: [PATCH 5/6] minor update for clarity

---
 html5lib/treewalkers/_base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/html5lib/treewalkers/_base.py b/html5lib/treewalkers/_base.py
index 3a61ad19..bd0e0c58 100644
--- a/html5lib/treewalkers/_base.py
+++ b/html5lib/treewalkers/_base.py
@@ -61,9 +61,9 @@ def startTag(self, namespace, name, attrs):
         return {"type": "StartTag",
                 "name": text_type(name),
                 "namespace": to_text(namespace),
-                "data": dict(((to_text(ns), to_text(na, False)),
-                              to_text(va, False))
-                             for (ns, na), va in attrs.items())}
+                "data": dict(((to_text(namespace, False), to_text(name)),
+                              to_text(value, False))
+                             for (namespace, name), value in attrs.items())}
 
     def endTag(self, namespace, name):
         assert namespace is None or isinstance(namespace, string_types), type(namespace)

From 841e67c902156af640ce9f7eabb6ac1ff89afe22 Mon Sep 17 00:00:00 2001
From: Marc DM <marcdm@fraser>
Date: Sat, 22 Jun 2013 23:08:51 -0500
Subject: [PATCH 6/6] updated test comment

---
 html5lib/tests/test_treewalkers.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/html5lib/tests/test_treewalkers.py b/html5lib/tests/test_treewalkers.py
index e95ae694..8c120163 100644
--- a/html5lib/tests/test_treewalkers.py
+++ b/html5lib/tests/test_treewalkers.py
@@ -342,7 +342,12 @@ def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
 def test_treewalker_six_mix():
     """Str/Unicode mix. If str attrs added to tree"""
 
-    # ToDo: Find a better way to specify that the attribute value is a bytestring
+    # On Python 2.x string literals are of type str. Unless, like this
+    # file, the programmer imports unicode_literals from __future__.
+    # In that case, string literals become objects of type unicode.
+
+    # This test simulates a Py2 user, modifying attributes on a document
+    # fragment but not using the u'' syntax nor importing unicode_literals
     sm_tests = [
         ('<a href="http://example.com">Example</a>',
          [(str('class'), str('test123'))],