From 82cf8641060725ccf5e4e00e6cc3b60191409e2c Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Fri, 7 Sep 2012 14:51:44 -0700
Subject: [PATCH 01/38] Removed list(set(...)) de-duplicate operations in
 ParseResult.__init__ as they destroy the ordering of urls, users etc in the
 tweet.  The list(set( operation on replies was dangerous as reply was a
 string not a list (so the string was split into a list of set elements of
 characters). Removed lots of non-pep8 whitespace

---
 tests.py | 218 +++++++++++++++++++++++++++----------------------------
 ttp.py   | 103 +++++++++++++-------------
 2 files changed, 160 insertions(+), 161 deletions(-)
diff --git a/tests.py b/tests.py
index e084abc..4eeb30a 100644
--- a/tests.py
+++ b/tests.py
@@ -24,8 +24,8 @@
 class TWPTests(unittest.TestCase):
     def setUp(self):
         self.parser = ttp.Parser()
-    
-    
+
+
     # General Tests ------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_all_not_allow_amp_without_question(self):
@@ -33,516 +33,516 @@ def test_all_not_allow_amp_without_question(self):
         self.assertEqual(result.html, u'Check out: <a href="http://www.github.com/test">http://www.github.com/test</a>&<a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.urls, [u'http://www.github.com/test'])
-    
+
     def test_all_not_break_url_at(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-    
-    
+
+
     # URL tests ----------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_url_mid(self):
         result = self.parser.parse(u'text http://example.com more text')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a> more text')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_unicode(self):
         result = self.parser.parse(u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp')
         self.assertEqual(result.html, u'I enjoy Macintosh Brand computers: <a href="http://✪df.ws/ejp">http://✪df.ws/ejp</a>')
         self.assertEqual(result.urls, [u'http://\u272adf.ws/ejp'])
-    
+
     def test_url_parentheses(self):
         result = self.parser.parse(u'text (http://example.com)')
         self.assertEqual(result.html, u'text (<a href="http://example.com">http://example.com</a>)')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_underscore(self):
         result = self.parser.parse(u'text http://example.com/test/foo_123.jpg')
         self.assertEqual(result.html, u'text <a href="http://example.com/test/foo_123.jpg">http://example.com/test/foo...</a>')
         self.assertEqual(result.urls, [u'http://example.com/test/foo_123.jpg'])
-    
+
     def test_url_underscore_dot(self):
         result = self.parser.parse(u'text http://example.com/test/bla.net_foo_123.jpg')
         self.assertEqual(result.html, u'text <a href="http://example.com/test/bla.net_foo_123.jpg">http://example.com/test/bla...</a>')
         self.assertEqual(result.urls, [u'http://example.com/test/bla.net_foo_123.jpg'])
-    
+
     def test_url_amp_lang_equals(self):
         result = self.parser.parse(u'Check out http://search.twitter.com/search?q=avro&lang=en')
         self.assertEqual(result.html, u'Check out <a href="http://search.twitter.com/search?q=avro&amp;lang=en">http://search.twitter.com/s...</a>')
         self.assertEqual(result.urls, [u'http://search.twitter.com/search?q=avro&lang=en'])
-    
+
     def test_url_amp_break(self):
         result = self.parser.parse(u'Check out http://twitter.com/te?foo&invalid=True')
         self.assertEqual(result.html, u'Check out <a href="http://twitter.com/te?foo&amp;invalid=True">http://twitter.com/te?foo...</a>')
         self.assertEqual(result.urls, [u'http://twitter.com/te?foo&invalid=True'])
-    
+
     def test_url_dash(self):
         result = self.parser.parse(u'Is www.foo-bar.com a valid URL?')
         self.assertEqual(result.html, u'Is <a href="http://www.foo-bar.com">www.foo-bar.com</a> a valid URL?')
         self.assertEqual(result.urls, [u'www.foo-bar.com'])
-    
+
     def test_url_multiple(self):
         result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com')
         self.assertEqual(result.html, u'<a href="http://example.com">http://example.com</a> <a href="https://sslexample.com">https://sslexample.com</a> <a href="http://sub.example.com">http://sub.example.com</a>')
         self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com'])
-    
+
     def test_url_raw_domain(self):
         result = self.parser.parse(u'See http://example.com example.com')
         self.assertEqual(result.html, u'See <a href="http://example.com">http://example.com</a> example.com')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_embed_link(self):
         result = self.parser.parse(u'<link rel=\'true\'>http://example.com</link>')
         self.assertEqual(result.html, u'<link rel=\'true\'><a href="http://example.com">http://example.com</a></link>')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_trailing(self):
         result = self.parser.parse(u'text http://example.com')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_japanese(self):
         result = self.parser.parse(u'いまなにしてるhttp://example.comいまなにしてる')
         self.assertEqual(result.html, u'いまなにしてる<a href="http://example.com">http://example.com</a>いまなにしてる')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_lots_of_punctuation(self):
         result = self.parser.parse(u'text http://xo.com/~matthew+%-,.;x')
         self.assertEqual(result.html, u'text <a href="http://xo.com/~matthew+%-,.;x">http://xo.com/~matthew+%-,.;x</a>')
         self.assertEqual(result.urls, [u'http://xo.com/~matthew+%-,.;x'])
-    
+
     def test_url_question_numbers(self):
         result = self.parser.parse(u'text http://example.com/?77e8fd')
         self.assertEqual(result.html, u'text <a href="http://example.com/?77e8fd">http://example.com/?77e8fd</a>')
         self.assertEqual(result.urls, [u'http://example.com/?77e8fd'])
-    
+
     def test_url_one_letter_other(self):
         result = self.parser.parse(u'text http://u.nu/')
         self.assertEqual(result.html, u'text <a href="http://u.nu/">http://u.nu/</a>')
         self.assertEqual(result.urls, [u'http://u.nu/'])
-        
+
         result = self.parser.parse(u'text http://u.tv/')
         self.assertEqual(result.html, u'text <a href="http://u.tv/">http://u.tv/</a>')
         self.assertEqual(result.urls, [u'http://u.tv/'])
-    
+
     def test_url_one_letter_iana(self):
         result = self.parser.parse(u'text http://x.com/')
         self.assertEqual(result.html, u'text <a href="http://x.com/">http://x.com/</a>')
         self.assertEqual(result.urls, [u'http://x.com/'])
-        
+
         result = self.parser.parse(u'text http://Q.com/')
         self.assertEqual(result.html, u'text <a href="http://Q.com/">http://Q.com/</a>')
         self.assertEqual(result.urls, [u'http://Q.com/'])
-        
+
         result = self.parser.parse(u'text http://z.com/')
         self.assertEqual(result.html, u'text <a href="http://z.com/">http://z.com/</a>')
         self.assertEqual(result.urls, [u'http://z.com/'])
-        
+
         result = self.parser.parse(u'text http://i.net/')
         self.assertEqual(result.html, u'text <a href="http://i.net/">http://i.net/</a>')
         self.assertEqual(result.urls, [u'http://i.net/'])
-        
+
         result = self.parser.parse(u'text http://q.net/')
         self.assertEqual(result.html, u'text <a href="http://q.net/">http://q.net/</a>')
         self.assertEqual(result.urls, [u'http://q.net/'])
-        
+
         result = self.parser.parse(u'text http://X.org/')
         self.assertEqual(result.html, u'text <a href="http://X.org/">http://X.org/</a>')
         self.assertEqual(result.urls, [u'http://X.org/'])
-    
+
     def test_url_long_hypens(self):
         result = self.parser.parse(u'text http://word-and-a-number-8-ftw.domain.tld/')
         self.assertEqual(result.html, u'text <a href="http://word-and-a-number-8-ftw.domain.tld/">http://word-and-a-number-8-...</a>')
         self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/'])
-    
-    
+
+
     # URL not tests ------------------------------------------------------------
     def test_not_url_dotdotdot(self):
         result = self.parser.parse(u'Is www...foo a valid URL?')
         self.assertEqual(result.html, u'Is www...foo a valid URL?')
         self.assertEqual(result.urls, [])
-    
+
     def test_not_url_dash(self):
         result = self.parser.parse(u'Is www.-foo.com a valid URL?')
         self.assertEqual(result.html, u'Is www.-foo.com a valid URL?')
         self.assertEqual(result.urls, [])
-    
+
     def test_not_url_no_tld(self):
         result = self.parser.parse(u'Is http://no-tld a valid URL?')
         self.assertEqual(result.html, u'Is http://no-tld a valid URL?')
         self.assertEqual(result.urls, [])
-    
+
     def test_not_url_tld_too_short(self):
         result = self.parser.parse(u'Is http://tld-too-short.x a valid URL?')
         self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?')
         self.assertEqual(result.urls, [])
-    
+
     def test_all_not_break_url_at(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-    
+
     def test_not_url_one_letter_iana(self):
         result = self.parser.parse(u'text http://a.com/ http://a.net/ http://a.org/')
         self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/')
         self.assertEqual(result.urls, [])
-    
-    
+
+
     # URL followed Tests -------------------------------------------------------
     def test_url_followed_question(self):
         result = self.parser.parse(u'text http://example.com?')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>?')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_colon(self):
         result = self.parser.parse(u'text http://example.com:')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>:')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_curly_brace(self):
         result = self.parser.parse(u'text http://example.com}')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>}')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_single_quote(self):
         result = self.parser.parse(u'text http://example.com')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_dot(self):
         result = self.parser.parse(u'text http://example.com.')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>.')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_exclamation(self):
         result = self.parser.parse(u'text http://example.com!')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>!')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_comma(self):
         result = self.parser.parse(u'text http://example.com,')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>,')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_brace(self):
         result = self.parser.parse(u'text http://example.com)')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>)')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_big_brace(self):
         result = self.parser.parse(u'text http://example.com]')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>]')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_equals(self):
         result = self.parser.parse(u'text http://example.com=')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>=')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_semicolon(self):
         result = self.parser.parse(u'text http://example.com;')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>;')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_url_followed_hypen(self):
         result = self.parser.parse(u'text http://domain.tld-that-you-should-have-put-a-space-after')
         self.assertEqual(result.html, u'text <a href="http://domain.tld">http://domain.tld</a>-that-you-should-have-put-a-space-after')
         self.assertEqual(result.urls, [u'http://domain.tld'])
-    
-    
+
+
     # URL preceeded Tests -------------------------------------------------------
     def test_url_preceeded_colon(self):
         result = self.parser.parse(u'text:http://example.com')
         self.assertEqual(result.html, u'text:<a href="http://example.com">http://example.com</a>')
         self.assertEqual(result.urls, [u'http://example.com'])
-    
+
     def test_not_url_preceeded_equals(self):
         result = self.parser.parse(u'text =http://example.com')
         self.assertEqual(result.html, u'text =http://example.com')
         self.assertEqual(result.urls, [])
-    
+
     # NOT
     def test_not_url_preceeded_forwardslash(self):
         result = self.parser.parse(u'text /http://example.com')
         self.assertEqual(result.html, u'text /http://example.com')
         self.assertEqual(result.urls, [])
-    
+
     def test_not_url_preceeded_exclamation(self):
         result = self.parser.parse(u'text !http://example.com')
         self.assertEqual(result.html, u'text !http://example.com')
         self.assertEqual(result.urls, [])
-    
-    
+
+
     # URL numeric tests --------------------------------------------------------
     def test_url_at_numeric(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-    
+
     def test_url_at_non_numeric(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/foobar')
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/foobar">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar'])
-    
-    
+
+
     # URL domain tests ---------------------------------------------------------
     def test_url_WWW(self):
         result = self.parser.parse(u'WWW.EXAMPLE.COM')
         self.assertEqual(result.html, u'<a href="http://WWW.EXAMPLE.COM">WWW.EXAMPLE.COM</a>')
         self.assertEqual(result.urls, [u'WWW.EXAMPLE.COM'])
-    
+
     def test_url_www(self):
         result = self.parser.parse(u'www.example.com')
         self.assertEqual(result.html, u'<a href="http://www.example.com">www.example.com</a>')
         self.assertEqual(result.urls, [u'www.example.com'])
-    
+
     def test_url_only_domain_query_followed_period(self):
         result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.')
         self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why?=because.i.want.it">http://tell.me/why?=because...</a>. Even when they contain a URL.')
         self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it'])
-    
+
     def test_url_only_domain_followed_period(self):
         result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.')
         self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me">http://tell.me</a>. Even when they contain a URL.')
         self.assertEqual(result.urls, [u'http://tell.me'])
-    
+
     def test_url_only_domain_path_followed_period(self):
         result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.')
         self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why">http://tell.me/why</a>. Even when they contain a URL.')
         self.assertEqual(result.urls, [u'http://tell.me/why'])
-    
+
     def test_url_long_tld(self):
         result = self.parser.parse(u'http://example.mobi/path')
         self.assertEqual(result.html, u'<a href="http://example.mobi/path">http://example.mobi/path</a>')
         self.assertEqual(result.urls, [u'http://example.mobi/path'])
-    
+
     def test_url_multiple_protocols(self):
         result = self.parser.parse(u'http://foo.com AND https://bar.com AND www.foobar.com')
         self.assertEqual(result.html, u'<a href="http://foo.com">http://foo.com</a> AND <a href="https://bar.com">https://bar.com</a> AND <a href="http://www.foobar.com">www.foobar.com</a>')
         self.assertEqual(result.urls, [u'http://foo.com', u'https://bar.com', u'www.foobar.com'])
-    
+
     # NOT
     def test_not_url_exclamation_domain(self):
         result = self.parser.parse(u'badly formatted http://foo!bar.com')
         self.assertEqual(result.html, u'badly formatted http://foo!bar.com')
         self.assertEqual(result.urls, [])
-    
+
     def test_not_url_under_domain(self):
         result = self.parser.parse(u'badly formatted http://foo_bar.com')
         self.assertEqual(result.html, u'badly formatted http://foo_bar.com')
         self.assertEqual(result.urls, [])
-    
-    
+
+
     # Hashtag tests ------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_hashtag_followed_full_whitespace(self):
         result = self.parser.parse(u'#hashtag　text')
         self.assertEqual(result.html, u'<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>　text')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_followed_full_hash(self):
         result = self.parser.parse(u'＃hashtag')
         self.assertEqual(result.html, u'<a href="http://search.twitter.com/search?q=%23hashtag">＃hashtag</a>')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_preceeded_full_whitespace(self):
         result = self.parser.parse(u'text　#hashtag')
         self.assertEqual(result.html, u'text　<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_number(self):
         result = self.parser.parse(u'text #1tag')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%231tag">#1tag</a>')
         self.assertEqual(result.tags, [u'1tag'])
-    
+
     def test_not_hashtag_escape(self):
         result = self.parser.parse(u'&#nbsp;')
         self.assertEqual(result.html, u'&#nbsp;')
         self.assertEqual(result.tags, [])
-    
+
     def test_hashtag_japanese(self):
         result = self.parser.parse(u'text #hashtagの')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>の')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_period(self):
         result = self.parser.parse(u'text.#hashtag')
         self.assertEqual(result.html, u'text.<a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_trailing(self):
         result = self.parser.parse(u'text #hashtag')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_not_hashtag_exclamation(self):
         result = self.parser.parse(u'text #hashtag!')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag">#hashtag</a>!')
         self.assertEqual(result.tags, [u'hashtag'])
-    
+
     def test_hashtag_multiple(self):
         result = self.parser.parse(u'text #hashtag1 #hashtag2')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hashtag1">#hashtag1</a> <a href="http://search.twitter.com/search?q=%23hashtag2">#hashtag2</a>')
         self.assertEqual(result.tags, [u'hashtag1', u'hashtag2'])
-    
+
     def test_not_hashtag_number(self):
         result = self.parser.parse(u'text #1234')
         self.assertEqual(result.html, u'text #1234')
         self.assertEqual(result.tags, [])
-    
+
     def test_not_hashtag_text(self):
         result = self.parser.parse(u'text#hashtag')
         self.assertEqual(result.html, u'text#hashtag')
         self.assertEqual(result.tags, [])
-    
+
     def test_hashtag_umlaut(self):
         result = self.parser.parse(u'text #hash_tagüäö')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash_tag%C3%BC%C3%A4%C3%B6">#hash_tagüäö</a>')
         self.assertEqual(result.tags, [u'hash_tag\xfc\xe4\xf6'])
-    
+
     def test_hashtag_alpha(self):
         result = self.parser.parse(u'text #hash0tag')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash0tag">#hash0tag</a>')
         self.assertEqual(result.tags, [u'hash0tag'])
-    
+
     def test_hashtag_under(self):
         result = self.parser.parse(u'text #hash_tag')
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash_tag">#hash_tag</a>')
         self.assertEqual(result.tags, [u'hash_tag'])
-    
-    
+
+
     # Username tests -----------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_not_username_preceded_letter(self):
         result = self.parser.parse(u'meet@the beach')
         self.assertEqual(result.html, u'meet@the beach')
         self.assertEqual(result.users, [])
-    
+
     def test_username_preceded_punctuation(self):
         result = self.parser.parse(u'.@username')
         self.assertEqual(result.html, u'.<a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_preceded_japanese(self):
         result = self.parser.parse(u'あ@username')
         self.assertEqual(result.html, u'あ<a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_followed_japanese(self):
         result = self.parser.parse(u'@usernameの')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>の')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_surrounded_japanese(self):
         result = self.parser.parse(u'あ@usernameの')
         self.assertEqual(result.html, u'あ<a href="http://twitter.com/username">@username</a>の')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_followed_punctuation(self):
         result = self.parser.parse(u'@username&^$%^')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>&^$%^')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_not_username_spaced(self):
         result = self.parser.parse(u'@ username')
         self.assertEqual(result.html, u'@ username')
         self.assertEqual(result.users, [])
-    
+
     def test_username_beginning(self):
         result = self.parser.parse(u'@username text')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a> text')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_to_long(self):
         result = self.parser.parse(u'@username9012345678901')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username901234567890">@username901234567890</a>1')
         self.assertEqual(result.users, [u'username901234567890'])
-    
+
     def test_username_full_at_sign(self):
         result = self.parser.parse(u'＠username')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">＠username</a>')
         self.assertEqual(result.users, [u'username'])
-    
+
     def test_username_trailing(self):
         result = self.parser.parse(u'text @username')
         self.assertEqual(result.html, u'text <a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
-    
+
     # Replies
     def test_username_reply_simple(self):
         result = self.parser.parse(u'@username')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.reply, u'username')
-    
+
     def test_username_reply_whitespace(self):
         result = self.parser.parse(u'   @username')
         self.assertEqual(result.html, u'   <a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.reply, u'username')
-    
+
     def test_username_reply_full(self):
         result = self.parser.parse(u'　@username')
         self.assertEqual(result.html, u'　<a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.reply, u'username')
-    
+
     def test_username_non_reply(self):
         result = self.parser.parse(u'test @username')
         self.assertEqual(result.html, u'test <a href="http://twitter.com/username">@username</a>')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.reply, None)
-    
-    
+
+
     # List tests ---------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_list_preceeded(self):
         result = self.parser.parse(u'text @username/list')
         self.assertEqual(result.html, u'text <a href="http://twitter.com/username/list">@username/list</a>')
         self.assertEqual(result.lists, [(u'username', u'list')])
-    
+
     def test_list_beginning(self):
         result = self.parser.parse(u'@username/list')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>')
         self.assertEqual(result.lists, [(u'username', u'list')])
-    
+
     def test_list_preceeded_punctuation(self):
         result = self.parser.parse(u'.@username/list')
         self.assertEqual(result.html, u'.<a href="http://twitter.com/username/list">@username/list</a>')
         self.assertEqual(result.lists, [(u'username', u'list')])
-    
+
     def test_list_followed_punctuation(self):
         result = self.parser.parse(u'@username/list&^$%^')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>&^$%^')
         self.assertEqual(result.lists, [(u'username', u'list')])
-    
+
     def test_list_not_slash_space(self):
         result = self.parser.parse(u'@username/ list')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username">@username</a>/ list')
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.lists, [])
-    
+
     def test_list_beginning(self):
         result = self.parser.parse(u'@username/list')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>')
         self.assertEqual(result.lists, [(u'username', u'list')])
-    
+
     def test_list_not_empty_username(self):
         result = self.parser.parse(u'text @/list')
         self.assertEqual(result.html, u'text @/list')
         self.assertEqual(result.lists, [])
-    
+
     def test_list_not_preceeded_letter(self):
         result = self.parser.parse(u'meet@the/beach')
         self.assertEqual(result.html, u'meet@the/beach')
         self.assertEqual(result.lists, [])
-    
+
     def test_list_long_truncate(self):
         result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890">@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A')
         self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')])
-    
+
     def test_list_with_dash(self):
         result = self.parser.parse(u'text @username/list-foo')
         self.assertEqual(result.html, u'text <a href="http://twitter.com/username/list-foo">@username/list-foo</a>')
diff --git a/ttp.py b/ttp.py
index 27102a9..c19b787 100644
--- a/ttp.py
+++ b/ttp.py
@@ -60,71 +60,70 @@
                           PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS),
                           re.IGNORECASE)
 
-
 # Registered IANA one letter domains
 IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net')
 
 
 class ParseResult(object):
     '''A class containing the results of a parsed Tweet.
-    
+
     Attributes:
     - urls:
         A list containing all the valid urls in the Tweet.
-    
+
     - users
         A list containing all the valid usernames in the Tweet.
-    
+
     - reply
         A string containing the username this tweet was a reply to.
         This only matches a username at the beginning of the Tweet,
         it may however be preceeded by whitespace.
         Note: It's generally better to rely on the Tweet JSON/XML in order to
         find out if it's a reply or not.
-        
+
     - lists
         A list containing all the valid lists in the Tweet.
         Each list item is a tuple in the format (username, listname).
-        
+
     - tags
         A list containing all the valid tags in theTweet.
-    
+
     - html
         A string containg formatted HTML.
         To change the formatting sublcass twp.Parser and override the format_*
         methods.
-    
+
     '''
-    
+
     def __init__(self, urls, users, reply, lists, tags, html):
-        self.urls = list(set(urls)) if urls else []  #fixes dups
-        self.users = list(set(users)) if users else []
-        self.lists = list(set(lists)) if lists else []
-        self.reply = list(set(reply)) if reply else []
-        self.tags = list(set(tags)) if tags else []
+        self.urls = urls if urls else []
+        self.users = users if users else []
+        self.lists = lists if lists else []
+        self.reply = reply if reply else None
+        self.tags = tags if tags else []
         self.html = html
 
 
 class Parser(object):
     '''A Tweet Parser'''
-    
+
     def __init__(self, max_url_length=30):
         self._max_url_length = max_url_length
-    
+
     def parse(self, text, html=True):
         '''Parse the text and return a ParseResult instance.'''
         self._urls = []
         self._users = []
         self._lists = []
         self._tags = []
-        
+
         reply = REPLY_REGEX.match(text)
         reply = reply.groups(0)[0] if reply is not None else None
-        
+
         parsed_html = self._html(text) if html else self._text(text)
         return ParseResult(self._urls, self._users, reply,
                            self._lists, self._tags, parsed_html)
-    
+
     def _text(self, text):
         '''Parse a Tweet without generating HTML.'''
         URL_REGEX.sub(self._parse_urls, text)
@@ -132,84 +131,84 @@ def _text(self, text):
         LIST_REGEX.sub(self._parse_lists, text)
         HASHTAG_REGEX.sub(self._parse_tags, text)
         return None
-    
+
     def _html(self, text):
         '''Parse a Tweet and generate HTML.'''
         html = URL_REGEX.sub(self._parse_urls, text)
         html = USERNAME_REGEX.sub(self._parse_users, html)
         html = LIST_REGEX.sub(self._parse_lists, html)
         return HASHTAG_REGEX.sub(self._parse_tags, html)
-    
-    
+
+
     # Internal parser stuff ----------------------------------------------------
     def _parse_urls(self, match):
         '''Parse URLs.'''
-        
+
         mat = match.group(0)
-        
+
         # Fix a bug in the regex concerning www...com and www.-foo.com domains
         # TODO fix this in the regex instead of working around it here
         domain = match.group(5)
         if domain[0] in '.-':
             return mat
-        
+
         # Only allow IANA one letter domains that are actually registered
         if len(domain) == 5 \
            and domain[-4:].lower() in ('.com', '.org', '.net') \
            and not domain.lower() in IANA_ONE_LETTER_DOMAINS:
-            
+
             return mat
-        
+
         # Check for urls without http(s)
         pos = mat.find('http')
         if pos != -1:
             pre, url = mat[:pos], mat[pos:]
             full_url = url
-        
+
         # Find the www and force http://
         else:
             pos = mat.lower().find('www')
             pre, url = mat[:pos], mat[pos:]
             full_url = 'http://%s' % url
-        
+
         self._urls.append(url)
-        
+
         if self._html:
             return '%s%s' % (pre, self.format_url(full_url,
                                        self._shorten_url(escape(url))))
-    
+
     def _parse_users(self, match):
         '''Parse usernames.'''
-        
+
         # Don't parse lists here
         if match.group(2) is not None:
             return match.group(0)
-        
+
         mat = match.group(0)
         self._users.append(mat[1:])
-        
+
         if self._html:
             return self.format_username(mat[0:1], mat[1:])
-    
+
     def _parse_lists(self, match):
         '''Parse lists.'''
-        
+
         # Don't parse usernames here
         if match.group(4) is None:
             return match.group(0)
-        
+
         pre, at_char, user, list_name = match.groups()
         list_name = list_name[1:]
         self._lists.append((user, list_name))
-        
+
         if self._html:
             return '%s%s' % (pre, self.format_list(at_char, user, list_name))
-    
+
     def _parse_tags(self, match):
         '''Parse hashtags.'''
-        
+
         mat = match.group(0)
-        
+
         # Fix problems with the regex capturing stuff infront of the #
         tag = None
         for i in u'#\uff03':
@@ -217,45 +216,45 @@ def _parse_tags(self, match):
             if pos != -1:
                 tag = i
                 break
-        
+
         pre, text = mat[:pos], mat[pos + 1:]
         self._tags.append(text)
-        
+
         if self._html:
             return '%s%s' % (pre, self.format_tag(tag, text))
-    
+
     def _shorten_url(self, text):
         '''Shorten a URL and make sure to not cut of html entities.'''
-        
+
         if len(text) > self._max_url_length and self._max_url_length != -1:
             text = text[0:self._max_url_length - 3]
             amp = text.rfind('&')
             close = text.rfind(';')
             if amp != -1 and (close == -1 or close < amp):
                 text = text[0:amp]
-            
+
             return text + '...'
-        
+
         else:
             return text
-    
-    
+
+
     # User defined formatters --------------------------------------------------
     def format_tag(self, tag, text):
         '''Return formatted HTML for a hashtag.'''
         return '<a href="http://search.twitter.com/search?q=%s">%s%s</a>' \
                 % (urllib.quote('#' + text.encode('utf-8')), tag, text)
-    
+
     def format_username(self, at_char, user):
         '''Return formatted HTML for a username.'''
         return '<a href="http://twitter.com/%s">%s%s</a>' \
                % (user, at_char, user)
-    
+
     def format_list(self, at_char, user, list_name):
         '''Return formatted HTML for a list.'''
         return '<a href="http://twitter.com/%s/%s">%s%s/%s</a>' \
                % (user, list_name, at_char, user, list_name)
-    
+
     def format_url(self, url, text):
         '''Return formatted HTML for a url.'''
         return '<a href="%s">%s</a>' % (escape(url), text)

From 71b793a35a6bdbb903a2b06c0e41975e69b87cc5 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Fri, 7 Sep 2012 15:05:52 -0700
Subject: [PATCH 02/38] Applied schwa's span addition
 https://github.com/schwa/twitter-text-python/commit/b81cef33a6fc12c837936d60a0b4a86222d45a4f
 to add option to extract span for matched parts of message for URLs, users
 etc

---
 ttp.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/ttp.py b/ttp.py
index c19b787..98a8844 100644
--- a/ttp.py
+++ b/ttp.py
@@ -107,8 +107,9 @@ def __init__(self, urls, users, reply, lists, tags, html):
 class Parser(object):
     '''A Tweet Parser'''
 
-    def __init__(self, max_url_length=30):
+    def __init__(self, max_url_length=30, include_spans = False):
         self._max_url_length = max_url_length
+        self._include_spans = include_spans
 
     def parse(self, text, html=True):
         '''Parse the text and return a ParseResult instance.'''
@@ -171,7 +172,10 @@ def _parse_urls(self, match):
             pre, url = mat[:pos], mat[pos:]
             full_url = 'http://%s' % url
 
-        self._urls.append(url)
+        if self._include_spans:
+            self._urls.append((url, match.span(0)))
+        else:
+            self._urls.append(url)
 
         if self._html:
             return '%s%s' % (pre, self.format_url(full_url,
@@ -185,7 +189,10 @@ def _parse_users(self, match):
             return match.group(0)
 
         mat = match.group(0)
-        self._users.append(mat[1:])
+        if self._include_spans:
+            self._users.append((mat[1:], match.span(0)))
+        else:
+            self._users.append(mat[1:])
 
         if self._html:
             return self.format_username(mat[0:1], mat[1:])
@@ -199,7 +206,10 @@ def _parse_lists(self, match):
 
         pre, at_char, user, list_name = match.groups()
         list_name = list_name[1:]
-        self._lists.append((user, list_name))
+        if self._include_spans:
+            self._lists.append((user, list_name, match.span(0)))
+        else:
+            self._lists.append((user, list_name))
 
         if self._html:
             return '%s%s' % (pre, self.format_list(at_char, user, list_name))
@@ -218,7 +228,10 @@ def _parse_tags(self, match):
                 break
 
         pre, text = mat[:pos], mat[pos + 1:]
-        self._tags.append(text)
+        if self._include_spans:
+            self._tags.append((text, match.span(0)))
+        else:
+            self._tags.append(text)
 
         if self._html:
             return '%s%s' % (pre, self.format_tag(tag, text))

From ff5a0c024ff2c4ced47f14fee007247913cc1888 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Fri, 7 Sep 2012 16:00:08 -0700
Subject: [PATCH 03/38] added span tests as a separate class

---
 tests.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/tests.py b/tests.py
index 4eeb30a..c409443 100644
--- a/tests.py
+++ b/tests.py
@@ -549,7 +549,33 @@ def test_list_with_dash(self):
         self.assertEqual(result.lists, [(u'username', u'list-foo')])
 
 
+class TWPTestsWithSpans(unittest.TestCase):
+    """Test ttp with re spans to extract character co-ords of matches"""
+    def setUp(self):
+        self.parser = ttp.Parser(include_spans = True)
+
+    def test_spans_in_tweets(self):
+        """Test some coca-cola tweets taken from twitter with spans"""
+        result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7')
+        self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))])
+
+        result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG')
+        self.assertEqual(result.urls, [])
+        self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))])
+        self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))])
+
+        result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA')
+        self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))])
+        self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))])
+        self.assertEqual(result.tags, [(u'GameOn', (207, 215)), (u'ad', (215, 219))])
+
+
 # Test it!
 if __name__ == '__main__':
-    unittest.main()
+    #unittest.main() # only seems to run 1 class?
 
+    verbosity = 0 # set to 2 for verbose output
+    suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
+    unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
+    unittest.TextTestRunner(verbosity=verbosity).run(suite)

From a202185a6cf45a6e07b0b7eacf00e4b1da0dc19e Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Fri, 7 Sep 2012 16:03:01 -0700
Subject: [PATCH 04/38] not sure what happened, unittest.main() does the job
 now

---
 tests.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests.py b/tests.py
index c409443..4c62778 100644
--- a/tests.py
+++ b/tests.py
@@ -572,10 +572,10 @@ def test_spans_in_tweets(self):
 
 # Test it!
 if __name__ == '__main__':
-    #unittest.main() # only seems to run 1 class?
+    unittest.main() # only seems to run 1 class?
 
-    verbosity = 0 # set to 2 for verbose output
-    suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
-    unittest.TextTestRunner(verbosity=verbosity).run(suite)
-    suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
-    unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    #verbosity = 0 # set to 2 for verbose output
+    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
+    #unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
+    #unittest.TextTestRunner(verbosity=verbosity).run(suite)

From 90fbc84d244a5445b005d679280077d183f05e5f Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sun, 9 Sep 2012 16:14:44 -0700
Subject: [PATCH 05/38] added test for hash and comma in URL

---
 tests.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests.py b/tests.py
index 4c62778..3ba1c90 100644
--- a/tests.py
+++ b/tests.py
@@ -28,6 +28,12 @@ def setUp(self):
 
     # General Tests ------------------------------------------------------------
     # --------------------------------------------------------------------------
+    def test_urls(self):
+        """Confirm that # in a URL works along with ,"""
+        result = self.parser.parse(u'big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag')
+        self.assertEqual(result.urls, [u'http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2'])
+        self.assertEqual(result.tags, [u'ahashtag'])
+
     def test_all_not_allow_amp_without_question(self):
         result = self.parser.parse(u'Check out: http://www.github.com/test&@username')
         self.assertEqual(result.html, u'Check out: <a href="http://www.github.com/test">http://www.github.com/test</a>&<a href="http://twitter.com/username">@username</a>')

From b25880ab09f21927b177ab8af8560cdf3b1a9474 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sun, 9 Sep 2012 23:55:43 -0700
Subject: [PATCH 06/38] uncovered two name-shielded tests and renamed, now also
 using non-html text for the span=True tests

---
 tests.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests.py b/tests.py
index 3ba1c90..c839b53 100644
--- a/tests.py
+++ b/tests.py
@@ -184,7 +184,7 @@ def test_not_url_tld_too_short(self):
         self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?')
         self.assertEqual(result.urls, [])
 
-    def test_all_not_break_url_at(self):
+    def test_all_not_break_url_at2(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
@@ -529,7 +529,7 @@ def test_list_not_slash_space(self):
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.lists, [])
 
-    def test_list_beginning(self):
+    def test_list_beginning2(self):
         result = self.parser.parse(u'@username/list')
         self.assertEqual(result.html, u'<a href="http://twitter.com/username/list">@username/list</a>')
         self.assertEqual(result.lists, [(u'username', u'list')])
@@ -565,16 +565,19 @@ def test_spans_in_tweets(self):
         result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7')
         self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))])
 
-        result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG')
+        result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False)
         self.assertEqual(result.urls, [])
         self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))])
         self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))])
 
-        result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA')
+        result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False)
         self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))])
         self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))])
-        self.assertEqual(result.tags, [(u'GameOn', (207, 215)), (u'ad', (215, 219))])
+        self.assertEqual(result.tags, [(u'GameOn', (75, 83)), (u'ad', (83, 87))])
 
+    def test_users_in_tweets(self):
+        result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False)
+        self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))])
 
 # Test it!
 if __name__ == '__main__':

From 536ba80fdd50815adc9487a88aa416bd447ae9d9 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 10 Sep 2012 00:39:50 -0700
Subject: [PATCH 07/38] fixed off-by-one span start for URL and hashtag
 matches when a pre character (e.g. a space) exists

---
 tests.py | 25 ++++++++++++++++++++-----
 ttp.py   | 10 ++++++++--
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/tests.py b/tests.py
index c839b53..26fb373 100644
--- a/tests.py
+++ b/tests.py
@@ -563,27 +563,42 @@ def setUp(self):
     def test_spans_in_tweets(self):
         """Test some coca-cola tweets taken from twitter with spans"""
         result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7')
-        self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))])
+        self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (41, 61))])
 
         result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False)
         self.assertEqual(result.urls, [])
-        self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))])
+        self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (1, 34))])
         self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))])
 
         result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False)
-        self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))])
+        self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (95, 115))])
         self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))])
-        self.assertEqual(result.tags, [(u'GameOn', (75, 83)), (u'ad', (83, 87))])
+        self.assertEqual(result.tags, [(u'GameOn', (76, 83)), (u'ad', (84, 87))])
 
     def test_users_in_tweets(self):
         result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False)
         self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))])
 
+    def test_edge_cases(self):
+        """Some edge cases that upset the original version of ttp"""
+        result = self.parser.parse(u' @user', html=False)
+        self.assertEqual(result.users, [(u'user', (1, 6))])
+
+        result = self.parser.parse(u' #hash ', html=False)
+        self.assertEqual(result.tags, [(u'hash', (1, 6))])
+
+        result = self.parser.parse(u' http://some.com ', html=False)
+        self.assertEqual(result.urls, [(u'http://some.com', (1, 16))])
+
+
+
 # Test it!
 if __name__ == '__main__':
-    unittest.main() # only seems to run 1 class?
+    unittest.main()
 
     #verbosity = 0 # set to 2 for verbose output
+    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases)
+    #unittest.TextTestRunner(verbosity=verbosity).run(suite)
     #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
     #unittest.TextTestRunner(verbosity=verbosity).run(suite)
     #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
diff --git a/ttp.py b/ttp.py
index 98a8844..b4552b3 100644
--- a/ttp.py
+++ b/ttp.py
@@ -173,7 +173,10 @@ def _parse_urls(self, match):
             full_url = 'http://%s' % url
 
         if self._include_spans:
-            self._urls.append((url, match.span(0)))
+            span = match.span(0)
+            # add an offset if pre is e.g. ' '
+            span = (span[0] + len(pre), span[1])
+            self._urls.append((url, span))
         else:
             self._urls.append(url)
 
@@ -229,7 +232,10 @@ def _parse_tags(self, match):
 
         pre, text = mat[:pos], mat[pos + 1:]
         if self._include_spans:
-            self._tags.append((text, match.span(0)))
+            span = match.span(0)
+            # add an offset if pre is e.g. ' '
+            span = (span[0] + len(pre), span[1])
+            self._tags.append((text, span))
         else:
             self._tags.append(text)
 

From a8c77dcbc1e04429f9159c6080b78f372e5845fd Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Wed, 12 Sep 2012 21:25:37 -0700
Subject: [PATCH 08/38] added reference to the original project

---
 README.rst | 6 ++++++
 ttp.py     | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/README.rst b/README.rst
index 207d3a9..06410ff 100644
--- a/README.rst
+++ b/README.rst
@@ -9,6 +9,12 @@ twitter-text-conformance_ plus some additional ones.
 .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
 .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
 
+UPDATE - forked by Ian Ozsvald, some bugs fixed, few minor changes to functionality added:
+https://github.com/ianozsvald/twitter-text-python
+
+The original ttp comes from:
+https://github.com/BonsaiDen/twitter-text-python
+
 Usage::
 
     >>> import ttp
diff --git a/ttp.py b/ttp.py
index b4552b3..8e68bf6 100644
--- a/ttp.py
+++ b/ttp.py
@@ -15,6 +15,10 @@
 
 # TODO create a setup.py
 
+# Forked by Ian Ozsvald:
+# https://github.com/ianozsvald/twitter-text-python
+# from:
+# https://github.com/BonsaiDen/twitter-text-python
 
 # Tweet Parser and Formatter ---------------------------------------------------
 # ------------------------------------------------------------------------------

From f3095689bd98f9969fe507b0a833c9e7156801cf Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Wed, 12 Sep 2012 21:27:53 -0700
Subject: [PATCH 09/38] setup.py: changed author and URL to point at the ianozsvald fork

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 9de7d83..a2d99b1 100644
--- a/setup.py
+++ b/setup.py
@@ -5,9 +5,9 @@
     version='1.0',
     description='Tweet parser and formatter',
     long_description=open('README.rst').read(),
-    author='Ivo Wetzel',
+    author='Ivo Wetzel (fork by Ian Ozsvald)',
     author_email='',
-    url='http://github.com/BonsaiDen/twitter-text-python',
+    url='https://github.com/ianozsvald/twitter-text-python',
     license='GPL',
     py_modules=['ttp'],
     include_package_data=True,

From be4d2e35c1a17ac2a8a8e904c62df25cb964b01f Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Wed, 12 Sep 2012 21:29:15 -0700
Subject: [PATCH 10/38] first: added empty __init__.py

---
 __init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 __init__.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29

From 489ca0461d3cbecd3f78d3098c785325940e298d Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:12:19 +0000
Subject: [PATCH 11/38] preparing for V1.0.0 release

---
 README.rst | 92 ++++++++++++++++++++++++++++++++++++++++--------------
 setup.py   | 18 +++++------
 ttp.py     |  2 ++
 3 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/README.rst b/README.rst
index 06410ff..7cf7840 100644
--- a/README.rst
+++ b/README.rst
@@ -1,7 +1,7 @@
 twitter-text-python
 ===================
 
-**twitter-text-python** is a Tweet parser and formatter for Python.
+**twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display.
 
 It is based on twitter-text-java_ and passes all the unittests of 
 twitter-text-conformance_ plus some additional ones.
@@ -9,32 +9,67 @@ twitter-text-conformance_ plus some additional ones.
 .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
 .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
 
-UPDATE - forked by Ian Ozsvald, some bugs fixed, few minor changes to functionality added:
+This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added:
 https://github.com/ianozsvald/twitter-text-python
 
-The original ttp comes from:
+PyPI release:
+http://pypi.python.org/pypi/twitter-text-python/
+
+The original ttp comes from Ivo Wetzel (Ivo's version no longer supported):
 https://github.com/BonsaiDen/twitter-text-python
 
 Usage::
 
     >>> import ttp
     >>> p = ttp.Parser()
-    >>> result = p.parse("@BonsaiDen Hey that's a great Tweet parser! #twp")
+    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
     >>> result.reply
-    'BonsaiDen'
+    'ianozsvald'
     >>> result.users
-    ['BonsaiDen']
+    ['ianozsvald']
     >>> result.tags
-    ['twp']
+    ['IvoWertzel']
     >>> result.urls
-    []
+    ['https://github.com/ianozsvald/']
     >>> result.html
-    u'<a href="http://twitter.com/BonsaiDen">@BonsaiDen</a> Hey that\'s a great Tweet Parser! 
-    <a href="http://search.twitter.com/search?q=%23twp">#twp</a>'
-
+    u'<a href="http://twitter.com/ianozsvald">@ianozsvald</a>, you now support <a href="http://search.twitter.com/search?q=%23IvoWertzel">#IvoWertzel</a>\'s tweet parser! <a href="https://github.com/ianozsvald/">https://github.com/ianozsvald/</a>'
 
 If you need different HTML output just subclass and override the ``format_*`` methods.
 
+You can also ask for the span tags to be returned for each entity::
+
+    >>> p = ttp.Parser(include_spans=True)
+    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
+    >>> result.urls
+    [('https://github.com/ianozsvald/', (57, 87))]
+
+
+
+Installation
+------------
+
+    $ pip install twitter-text-python  # via: http://pypi.python.org/pypi/twitter-text-python
+    $ python
+    >>> import ttp
+    >>> ttp.__version__
+    '1.0.0'
+
+
+Changelog
+---------
+
+ * 2013/2/11 1.0.0 released to PyPI
+
+
+Tests
+-----
+
+    $ python tests.py
+    .................................................................................................
+    ----------------------------------------------------------------------
+    Ran 97 tests in 0.009s
+    OK
+
 
 Contributing
 ------------
@@ -43,23 +78,32 @@ The source is available on GitHub_, to
 contribute to the project, fork it on GitHub and send a pull request.
 Everyone is welcome to make improvements to **twp**!
 
-.. _GitHub: http://github.com/BonsaiDen/twitter-text-python
+.. _GitHub: https://github.com/ianozsvald/twitter-text-python
+
 
 License
-=======
+-------
+
+*MIT*
 
-Copyright (c) 2010 Ivo Wetzel
+Copyright (c) 2012 Ivo Wetzel.
 
-**twitter-text-python** is free software: you can redistribute it and/or 
-modify it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
 
-**twitter-text-python** is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
 
-You should have received a copy of the GNU General Public License along with
-**twitter-text-python**. If not, see <http://www.gnu.org/licenses/>.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
 
+Copyright (c) 2010-2013 Ivo Wetzel
diff --git a/setup.py b/setup.py
index a2d99b1..8d1305a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,21 +3,21 @@
 setup(
     name='twitter-text-python',
     version='1.0',
-    description='Tweet parser and formatter',
-    long_description=open('README.rst').read(),
-    author='Ivo Wetzel (fork by Ian Ozsvald)',
-    author_email='',
+    description='Twitter Tweet parser and formatter',
+    long_description="no long description", #open('README.rst').read(),
+    author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)',
+    author_email='ian@ianozsvald.com',
     url='https://github.com/ianozsvald/twitter-text-python',
-    license='GPL',
-    py_modules=['ttp'],
+    license='MIT',
+    py_modules=['ttp', 'tests'],
     include_package_data=True,
     zip_safe=False,
     install_requires=[],
+    #data_files=[('./', ['README.rst'])],
     classifiers=[
-        'Environment :: Web Environment',
-        # I don't know what exactly this means, but why not?
+        'Environment :: Console',
         'Intended Audience :: Developers',
-        'License :: OSI Approved :: BSD License',
+        #'License :: OSI Approved :: GPL License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
         'Topic :: Software Development :: Libraries :: Python Modules',
diff --git a/ttp.py b/ttp.py
index 8e68bf6..b599d6d 100644
--- a/ttp.py
+++ b/ttp.py
@@ -25,6 +25,8 @@
 import re
 import urllib
 
+__version__ = "1.0.0"
+
 # Some of this code has been translated from the twitter-text-java library:
 # <http://github.com/mzsanford/twitter-text-java>
 AT_SIGNS = ur'[@\uff20]'

From e2c57a50ad7b2f8efadf398972cf2ce37e34d28f Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:15:42 +0000
Subject: [PATCH 12/38] weird formatting bug

---
 README.rst | 101 -----------------------------------------------------
 1 file changed, 101 deletions(-)

diff --git a/README.rst b/README.rst
index 7cf7840..3c04032 100644
--- a/README.rst
+++ b/README.rst
@@ -6,104 +6,3 @@ twitter-text-python
 It is based on twitter-text-java_ and passes all the unittests of 
 twitter-text-conformance_ plus some additional ones.
 
-.. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
-.. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
-
-This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added:
-https://github.com/ianozsvald/twitter-text-python
-
-PyPI release:
-http://pypi.python.org/pypi/twitter-text-python/
-
-The original ttp comes from Ivo Wetzel (Ivo's version no longer supported):
-https://github.com/BonsaiDen/twitter-text-python
-
-Usage::
-
-    >>> import ttp
-    >>> p = ttp.Parser()
-    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
-    >>> result.reply
-    'ianozsvald'
-    >>> result.users
-    ['ianozsvald']
-    >>> result.tags
-    ['IvoWertzel']
-    >>> result.urls
-    ['https://github.com/ianozsvald/']
-    >>> result.html
-    u'<a href="http://twitter.com/ianozsvald">@ianozsvald</a>, you now support <a href="http://search.twitter.com/search?q=%23IvoWertzel">#IvoWertzel</a>\'s tweet parser! <a href="https://github.com/ianozsvald/">https://github.com/ianozsvald/</a>'
-
-If you need different HTML output just subclass and override the ``format_*`` methods.
-
-You can also ask for the span tags to be returned for each entity::
-
-    >>> p = ttp.Parser(include_spans=True)
-    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
-    >>> result.urls
-    [('https://github.com/ianozsvald/', (57, 87))]
-
-
-
-Installation
-------------
-
-    $ pip install twitter-text-python  # via: http://pypi.python.org/pypi/twitter-text-python
-    $ python
-    >>> import ttp
-    >>> ttp.__version__
-    '1.0.0'
-
-
-Changelog
----------
-
- * 2013/2/11 1.0.0 released to PyPI
-
-
-Tests
------
-
-    $ python tests.py
-    .................................................................................................
-    ----------------------------------------------------------------------
-    Ran 97 tests in 0.009s
-    OK
-
-
-Contributing
-------------
-
-The source is available on GitHub_, to
-contribute to the project, fork it on GitHub and send a pull request.
-Everyone is welcome to make improvements to **twp**!
-
-.. _GitHub: https://github.com/ianozsvald/twitter-text-python
-
-
-License
--------
-
-*MIT*
-
-Copyright (c) 2012 Ivo Wetzel.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-
-Copyright (c) 2010-2013 Ivo Wetzel

From 2ae04ff21c3492372ae7b492c34caad1536bd7e9 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:16:19 +0000
Subject: [PATCH 13/38] weird formatting bug

---
 README.rst | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/README.rst b/README.rst
index 3c04032..00227fa 100644
--- a/README.rst
+++ b/README.rst
@@ -6,3 +6,15 @@ twitter-text-python
 It is based on twitter-text-java_ and passes all the unittests of 
 twitter-text-conformance_ plus some additional ones.
 
+.. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
+.. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
+
+This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added:
+https://github.com/ianozsvald/twitter-text-python
+
+PyPI release:
+http://pypi.python.org/pypi/twitter-text-python/
+
+The original ttp comes from Ivo Wetzel (Ivo's version no longer supported):
+https://github.com/BonsaiDen/twitter-text-python
+

From c8e40cd24c7219eba192f598870561025434a2f2 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:16:44 +0000
Subject: [PATCH 14/38] weird formatting bug

---
 README.rst | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/README.rst b/README.rst
index 00227fa..9e3d406 100644
--- a/README.rst
+++ b/README.rst
@@ -18,3 +18,19 @@ http://pypi.python.org/pypi/twitter-text-python/
 The original ttp comes from Ivo Wetzel (Ivo's version no longer supported):
 https://github.com/BonsaiDen/twitter-text-python
 
+Usage::
+
+    >>> import ttp
+    >>> p = ttp.Parser()
+    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
+    >>> result.reply
+    'ianozsvald'
+    >>> result.users
+    ['ianozsvald']
+    >>> result.tags
+    ['IvoWertzel']
+    >>> result.urls
+    ['https://github.com/ianozsvald/']
+    >>> result.html
+    u'<a href="http://twitter.com/ianozsvald">@ianozsvald</a>, you now support <a href="http://search.twitter.com/search?q=%23IvoWertzel">#IvoWertzel</a>\'s tweet parser! <a href="https://github.com/ianozsvald/">https://github.com/ianozsvald/</a>'
+

From 77ff625ed016efe28d0087d301e66238d7fb81b0 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:17:14 +0000
Subject: [PATCH 15/38] weird formatting bug

---
 README.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/README.rst b/README.rst
index 9e3d406..51219ec 100644
--- a/README.rst
+++ b/README.rst
@@ -34,3 +34,49 @@ Usage::
     >>> result.html
     u'<a href="http://twitter.com/ianozsvald">@ianozsvald</a>, you now support <a href="http://search.twitter.com/search?q=%23IvoWertzel">#IvoWertzel</a>\'s tweet parser! <a href="https://github.com/ianozsvald/">https://github.com/ianozsvald/</a>'
 
+If you need different HTML output just subclass and override the ``format_*`` methods.
+
+You can also ask for the span tags to be returned for each entity::
+
+    >>> p = ttp.Parser(include_spans=True)
+    >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
+    >>> result.urls
+    [('https://github.com/ianozsvald/', (57, 87))]
+
+
+
+Installation
+------------
+
+    $ pip install twitter-text-python  # via: http://pypi.python.org/pypi/twitter-text-python
+    $ python
+    >>> import ttp
+    >>> ttp.__version__
+    '1.0.0'
+
+
+Changelog
+---------
+
+ * 2013/2/11 1.0.0 released to PyPI
+
+
+Tests
+-----
+
+    $ python tests.py
+    .................................................................................................
+    ----------------------------------------------------------------------
+    Ran 97 tests in 0.009s
+    OK
+
+
+Contributing
+------------
+
+The source is available on GitHub_, to
+contribute to the project, fork it on GitHub and send a pull request.
+Everyone is welcome to make improvements to **twp**!
+
+.. _GitHub: https://github.com/ianozsvald/twitter-text-python
+    

From 4297316f2e8a31cd6d6f4335ec444b22369795bb Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:18:05 +0000
Subject: [PATCH 16/38] weird formatting bug

---
 README.rst | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index 51219ec..1ce4b94 100644
--- a/README.rst
+++ b/README.rst
@@ -58,7 +58,7 @@ Installation
 Changelog
 ---------
 
- * 2013/2/11 1.0.0 released to PyPI
+2013/2/11 1.0.0 released to PyPI
 
 
 Tests
@@ -70,13 +70,3 @@ Tests
     Ran 97 tests in 0.009s
     OK
 
-
-Contributing
-------------
-
-The source is available on GitHub_, to
-contribute to the project, fork it on GitHub and send a pull request.
-Everyone is welcome to make improvements to **twp**!
-
-.. _GitHub: https://github.com/ianozsvald/twitter-text-python
-    

From 22c73a9430065d013450fc1cbe5e38088c237c86 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:18:39 +0000
Subject: [PATCH 17/38] weird formatting bug

---
 README.rst | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/README.rst b/README.rst
index 1ce4b94..987c767 100644
--- a/README.rst
+++ b/README.rst
@@ -55,18 +55,3 @@ Installation
     '1.0.0'
 
 
-Changelog
----------
-
-2013/2/11 1.0.0 released to PyPI
-
-
-Tests
------
-
-    $ python tests.py
-    .................................................................................................
-    ----------------------------------------------------------------------
-    Ran 97 tests in 0.009s
-    OK
-

From e2e36155ab93586ceda15f84b6f7069ca35547f4 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:19:29 +0000
Subject: [PATCH 18/38] weird formatting bug

---
 README.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 987c767..4cd9fd0 100644
--- a/README.rst
+++ b/README.rst
@@ -48,7 +48,10 @@ You can also ask for the span tags to be returned for each entity::
 Installation
 ------------
 
-    $ pip install twitter-text-python  # via: http://pypi.python.org/pypi/twitter-text-python
+pip and easy_install will do the job::
+
+    # via: http://pypi.python.org/pypi/twitter-text-python
+    $ pip install twitter-text-python  
     $ python
     >>> import ttp
     >>> ttp.__version__

From 4b8121cf335e238c4e80052c119b1413c7394d9b Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:20:47 +0000
Subject: [PATCH 19/38] weird formatting bug

---
 README.rst | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/README.rst b/README.rst
index 4cd9fd0..b9c08d2 100644
--- a/README.rst
+++ b/README.rst
@@ -57,4 +57,23 @@ pip and easy_install will do the job::
     >>> ttp.__version__
     '1.0.0'
 
+Changelog
+---------
+
+Release history::
+
+    * 2013/2/11 1.0.0 released to PyPI
+
+
+Tests
+-----
+
+Checkout the code via github https://github.com/ianozsvald/twitter-text-python and run tests locally::
+
+    $ python tests.py
+    .................................................................................................
+    ----------------------------------------------------------------------
+    Ran 97 tests in 0.009s
+    OK
+
 

From c024c5803f7ac81704baadfbe3ce11b9501fc1c2 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:21:14 +0000
Subject: [PATCH 20/38] weird formatting bug

---
 README.rst | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index b9c08d2..118f820 100644
--- a/README.rst
+++ b/README.rst
@@ -60,9 +60,7 @@ pip and easy_install will do the job::
 Changelog
 ---------
 
-Release history::
-
-    * 2013/2/11 1.0.0 released to PyPI
+ * 2013/2/11 1.0.0 released to PyPI
 
 
 Tests

From 9b86dc1dfc0e7cfdb8e6b114416c36a22a1051d5 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:21:44 +0000
Subject: [PATCH 21/38] weird formatting bug

---
 README.rst | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/README.rst b/README.rst
index 118f820..ef3c921 100644
--- a/README.rst
+++ b/README.rst
@@ -74,4 +74,40 @@ Checkout the code via github https://github.com/ianozsvald/twitter-text-python a
     Ran 97 tests in 0.009s
     OK
 
+Contributing
+------------
+
+The source is available on GitHub_, to
+contribute to the project, fork it on GitHub and send a pull request.
+Everyone is welcome to make improvements to **twp**!
+
+.. _GitHub: https://github.com/ianozsvald/twitter-text-python
+
+
+License
+-------
+
+*MIT*
+
+Copyright (c) 2012 Ivo Wetzel.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+Copyright (c) 2010-2013 Ivo Wetzel
 

From 400758b5cbe18d4a3b240261cca514952c0caa38 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 21:30:01 +0000
Subject: [PATCH 22/38] minor

---
 setup.py | 5 +++--
 ttp.py   | 2 --
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 8d1305a..9f00854 100644
--- a/setup.py
+++ b/setup.py
@@ -2,9 +2,10 @@
 
 setup(
     name='twitter-text-python',
-    version='1.0',
+    version='1.0.0',
     description='Twitter Tweet parser and formatter',
-    long_description="no long description", #open('README.rst').read(),
+    long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.",
+    #open('README.rst').read(),
     author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)',
     author_email='ian@ianozsvald.com',
     url='https://github.com/ianozsvald/twitter-text-python',
diff --git a/ttp.py b/ttp.py
index b599d6d..5194297 100644
--- a/ttp.py
+++ b/ttp.py
@@ -13,8 +13,6 @@
 #  You should have received a copy of the GNU General Public License along with
 #  twitter-text-python. If not, see <http://www.gnu.org/licenses/>.
 
-# TODO create a setup.py
-
 # Forked by Ian Ozsvald:
 # https://github.com/ianozsvald/twitter-text-python
 # from:

From bdf73168bd859be69cb936456eab46915ee1d3c6 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 11 Feb 2013 22:08:35 +0000
Subject: [PATCH 23/38] version bump after fixing up setup.py to use a
 subdirectory

---
 README.rst                     | 8 ++++----
 setup.py                       | 8 ++++----
 __init__.py => ttp/__init__.py | 0
 tests.py => ttp/tests.py       | 0
 ttp.py => ttp/ttp.py           | 2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)
 rename __init__.py => ttp/__init__.py (100%)
 rename tests.py => ttp/tests.py (100%)
 rename ttp.py => ttp/ttp.py (99%)

diff --git a/README.rst b/README.rst
index ef3c921..ccfcd33 100644
--- a/README.rst
+++ b/README.rst
@@ -20,7 +20,7 @@ https://github.com/BonsaiDen/twitter-text-python
 
 Usage::
 
-    >>> import ttp
+    >>> from ttp import ttp
     >>> p = ttp.Parser()
     >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
     >>> result.reply
@@ -53,14 +53,14 @@ pip and easy_install will do the job::
     # via: http://pypi.python.org/pypi/twitter-text-python
     $ pip install twitter-text-python  
     $ python
-    >>> import ttp
+    >>> from ttp import ttp
     >>> ttp.__version__
-    '1.0.0'
+    '1.0.0.2'
 
 Changelog
 ---------
 
- * 2013/2/11 1.0.0 released to PyPI
+ * 2013/2/11 1.0.0.2 released to PyPI
 
 
 Tests
diff --git a/setup.py b/setup.py
index 9f00854..60070b1 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='twitter-text-python',
-    version='1.0.0',
+    version='1.0.0.2',
     description='Twitter Tweet parser and formatter',
     long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.",
     #open('README.rst').read(),
@@ -10,17 +10,17 @@
     author_email='ian@ianozsvald.com',
     url='https://github.com/ianozsvald/twitter-text-python',
     license='MIT',
-    py_modules=['ttp', 'tests'],
+    packages=['ttp'],
     include_package_data=True,
     zip_safe=False,
     install_requires=[],
-    #data_files=[('./', ['README.rst'])],
     classifiers=[
         'Environment :: Console',
         'Intended Audience :: Developers',
-        #'License :: OSI Approved :: GPL License',
+        'License :: OSI Approved :: MIT License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
         'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: Text Processing :: Linguistic',
     ]
 )
diff --git a/__init__.py b/ttp/__init__.py
similarity index 100%
rename from __init__.py
rename to ttp/__init__.py
diff --git a/tests.py b/ttp/tests.py
similarity index 100%
rename from tests.py
rename to ttp/tests.py
diff --git a/ttp.py b/ttp/ttp.py
similarity index 99%
rename from ttp.py
rename to ttp/ttp.py
index 5194297..2996038 100644
--- a/ttp.py
+++ b/ttp/ttp.py
@@ -23,7 +23,7 @@
 import re
 import urllib
 
-__version__ = "1.0.0"
+__version__ = "1.0.0.2"
 
 # Some of this code has been translated from the twitter-text-java library:
 # <http://github.com/mzsanford/twitter-text-java>

From 52c61013ce1fedfe2fc640dd48f93f278beeb4a2 Mon Sep 17 00:00:00 2001
From: Lee Semel <lee@semel.net>
Date: Mon, 25 Mar 2013 10:27:04 -0400
Subject: [PATCH 24/38] Fix t.co urls followed by a comma

---
 ttp/tests.py | 5 +++++
 ttp/ttp.py   | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/ttp/tests.py b/ttp/tests.py
index 26fb373..4d01e1d 100644
--- a/ttp/tests.py
+++ b/ttp/tests.py
@@ -231,6 +231,11 @@ def test_url_followed_comma(self):
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>,')
         self.assertEqual(result.urls, [u'http://example.com'])
 
+    def test_url_with_path_followed_comma(self):
+        result = self.parser.parse(u'text http://example.com/abcde, more')
+        self.assertEqual(result.html, u'text <a href="http://example.com/abcde">http://example.com/abcde</a>, more')
+        self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
     def test_url_followed_brace(self):
         result = self.parser.parse(u'text http://example.com)')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>)')
diff --git a/ttp/ttp.py b/ttp/ttp.py
index 2996038..b8f8404 100644
--- a/ttp/ttp.py
+++ b/ttp/ttp.py
@@ -59,7 +59,7 @@
 PATH_ENDING_CHARS = r'[%s\)=#/]' % UTF_CHARS
 QUERY_ENDING_CHARS = '[a-z0-9_&=#]'
 
-URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/%s*%s?)?(\?%s*%s)?))'
+URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))'
                        % (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS,
                           PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS),
                           re.IGNORECASE)

From a9973f9cb3456bf45512fb41c343285b118b8d22 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Tue, 26 Mar 2013 11:23:53 +0000
Subject: [PATCH 25/38] added some notes for TODO

---
 README.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.rst b/README.rst
index ccfcd33..5be038d 100644
--- a/README.rst
+++ b/README.rst
@@ -84,6 +84,13 @@ Everyone is welcome to make improvements to **twp**!
 .. _GitHub: https://github.com/ianozsvald/twitter-text-python
 
 
+Todo
+----
+
+  * Consider adding capitalised phrase identification
+  * Make it 1 line to parse and get a results dict via __init__.py
+  * Tag the next release
+
 License
 -------
 

From 19e2368e3e41cd769f733e5db1a1c713419ecb83 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Thu, 28 Mar 2013 14:52:31 +0000
Subject: [PATCH 26/38] bump of version nbr for this new working version, added
 a shortlink follower in utils.py

---
 README.rst   |  9 +++++++++
 ttp/ttp.py   |  2 +-
 ttp/utils.py | 28 ++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 ttp/utils.py

diff --git a/README.rst b/README.rst
index 5be038d..f7cf047 100644
--- a/README.rst
+++ b/README.rst
@@ -44,6 +44,14 @@ You can also ask for the span tags to be returned for each entity::
     [('https://github.com/ianozsvald/', (57, 87))]
 
 
+To use the shortlink follower::
+
+    >>> from ttp import utils
+    >>> # assume that result.urls == ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF']
+    >>> print utils.follow_shortlinks(result.urls)  # pass in list of shortlink URLs
+    {'http://t.co/8o0z9BbEMu': [u'http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562'], u'http://bbc.in/16dClPF': [u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562']}
+    >>> # note that bad shortlink URLs have a key to an empty list (lost/forgotten shortlink URLs don't generate any error)
+
 
 Installation
 ------------
@@ -61,6 +69,7 @@ Changelog
 ---------
 
  * 2013/2/11 1.0.0.2 released to PyPI
+ * 2013/4/? 1.0.1 new working version
 
 
 Tests
diff --git a/ttp/ttp.py b/ttp/ttp.py
index 2996038..4b7cb83 100644
--- a/ttp/ttp.py
+++ b/ttp/ttp.py
@@ -23,7 +23,7 @@
 import re
 import urllib
 
-__version__ = "1.0.0.2"
+__version__ = "1.0.1.0"
 
 # Some of this code has been translated from the twitter-text-java library:
 # <http://github.com/mzsanford/twitter-text-java>
diff --git a/ttp/utils.py b/ttp/utils.py
new file mode 100644
index 0000000..2c3d822
--- /dev/null
+++ b/ttp/utils.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Unwind short-links e.g. bit.ly, t.co etc to their canonical links"""
+import requests
+
+
+def follow_shortlinks(shortlinks):
+    """Follow redirects in list of shortlinks, return dict of resulting URLs"""
+    links_followed = {}  # maps each input shortlink to the list of URLs visited while following it
+    for shortlink in shortlinks:
+        url = shortlink
+        request_result = requests.get(url)  # NOTE(review): no timeout argument is passed and exceptions raised by this call are not caught here
+        redirect_history = request_result.history
+        # history is read as the chain of redirect responses; it might look like:
+        # (<Response [301]>, <Response [301]>)
+        # where each response object has a URL
+        all_urls = []
+        for redirect in redirect_history:
+            all_urls.append(redirect.url)
+        # append the final URL that we finish with
+        all_urls.append(request_result.url)  # always runs, so every value list holds at least one URL
+        links_followed[shortlink] = all_urls
+    return links_followed
+
+
+if __name__ == "__main__":
+    shortlinks = ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF']  # sample inputs for a manual run
+    print follow_shortlinks(shortlinks)  # Python 2 print statement; this call issues live HTTP requests

From 1bab751f5208aa0c8f4b309d44ed94c87478a13e Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Thu, 28 Mar 2013 15:10:33 +0000
Subject: [PATCH 27/38] added requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6a99645
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests==1.1.0

From dd4e9322d01985b69e84c51e47ac063570d4170e Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Thu, 4 Apr 2013 21:59:03 +0100
Subject: [PATCH 28/38] adding some comma-parsing tests

---
 README.rst   |  1 +
 ttp/tests.py | 10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/README.rst b/README.rst
index f7cf047..44da770 100644
--- a/README.rst
+++ b/README.rst
@@ -97,6 +97,7 @@ Todo
 ----
 
   * Consider adding capitalised phrase identification
+  * Consider adding a repeated-char remover (e.g. grrrrrrr->grr)
   * Make it 1 line to parse and get a results dict via __init__.py
   * Tag the next release
 
diff --git a/ttp/tests.py b/ttp/tests.py
index 4d01e1d..d302537 100644
--- a/ttp/tests.py
+++ b/ttp/tests.py
@@ -231,11 +231,21 @@ def test_url_followed_comma(self):
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>,')
         self.assertEqual(result.urls, [u'http://example.com'])
 
+    def test_url_with_path_preceeded_by_comma(self):
+        result = self.parser.parse(u'text ,http://example.com/abcde, more')
+        self.assertEqual(result.html, u'text ,<a href="http://example.com/abcde">http://example.com/abcde</a>, more')
+        self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
     def test_url_with_path_followed_comma(self):
         result = self.parser.parse(u'text http://example.com/abcde, more')
         self.assertEqual(result.html, u'text <a href="http://example.com/abcde">http://example.com/abcde</a>, more')
         self.assertEqual(result.urls, [u'http://example.com/abcde'])
 
+    def test_url_with_path_followed_commas(self):
+        result = self.parser.parse(u'text http://example.com/abcde,, more')
+        self.assertEqual(result.html, u'text <a href="http://example.com/abcde">http://example.com/abcde</a>,, more')
+        self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
     def test_url_followed_brace(self):
         result = self.parser.parse(u'text http://example.com)')
         self.assertEqual(result.html, u'text <a href="http://example.com">http://example.com</a>)')

From 4b2d7a02e5fb149f8200ca14597e83f00c62b273 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Thu, 4 Apr 2013 22:00:57 +0100
Subject: [PATCH 29/38] extra note on how to run tests

---
 README.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/README.rst b/README.rst
index 44da770..a8da4cc 100644
--- a/README.rst
+++ b/README.rst
@@ -69,7 +69,7 @@ Changelog
 ---------
 
  * 2013/2/11 1.0.0.2 released to PyPI
- * 2013/4/? 1.0.1 new working version
+ * 2013/4/? 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack)
 
 
 Tests
@@ -77,12 +77,13 @@ Tests
 
 Checkout the code via github https://github.com/ianozsvald/twitter-text-python and run tests locally::
 
-    $ python tests.py
-    .................................................................................................
+    $ python ttp/tests.py 
+    ....................................................................................................
     ----------------------------------------------------------------------
-    Ran 97 tests in 0.009s
+    Ran 100 tests in 0.009s
     OK
 
+
 Contributing
 ------------
 

From f80d89c5d86873bddda5071346ac1b8848103e80 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:33:40 +0100
Subject: [PATCH 30/38] used autopep8 to clean up the src

---
 ttp/tests.py | 38 ++++++++++++++++----------------------
 ttp/ttp.py   | 17 ++++++++---------
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/ttp/tests.py b/ttp/tests.py
index d302537..39aa5ab 100644
--- a/ttp/tests.py
+++ b/ttp/tests.py
@@ -22,10 +22,10 @@
 
 
 class TWPTests(unittest.TestCase):
+
     def setUp(self):
         self.parser = ttp.Parser()
 
-
     # General Tests ------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_urls(self):
@@ -45,7 +45,6 @@ def test_all_not_break_url_at(self):
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/4382024406">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
 
-
     # URL tests ----------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_url_mid(self):
@@ -90,7 +89,8 @@ def test_url_dash(self):
 
     def test_url_multiple(self):
         result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com')
-        self.assertEqual(result.html, u'<a href="http://example.com">http://example.com</a> <a href="https://sslexample.com">https://sslexample.com</a> <a href="http://sub.example.com">http://sub.example.com</a>')
+        self.assertEqual(
+            result.html, u'<a href="http://example.com">http://example.com</a> <a href="https://sslexample.com">https://sslexample.com</a> <a href="http://sub.example.com">http://sub.example.com</a>')
         self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com'])
 
     def test_url_raw_domain(self):
@@ -162,7 +162,6 @@ def test_url_long_hypens(self):
         self.assertEqual(result.html, u'text <a href="http://word-and-a-number-8-ftw.domain.tld/">http://word-and-a-number-8-...</a>')
         self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/'])
 
-
     # URL not tests ------------------------------------------------------------
     def test_not_url_dotdotdot(self):
         result = self.parser.parse(u'Is www...foo a valid URL?')
@@ -194,7 +193,6 @@ def test_not_url_one_letter_iana(self):
         self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/')
         self.assertEqual(result.urls, [])
 
-
     # URL followed Tests -------------------------------------------------------
     def test_url_followed_question(self):
         result = self.parser.parse(u'text http://example.com?')
@@ -271,7 +269,6 @@ def test_url_followed_hypen(self):
         self.assertEqual(result.html, u'text <a href="http://domain.tld">http://domain.tld</a>-that-you-should-have-put-a-space-after')
         self.assertEqual(result.urls, [u'http://domain.tld'])
 
-
     # URL preceeded Tests -------------------------------------------------------
     def test_url_preceeded_colon(self):
         result = self.parser.parse(u'text:http://example.com')
@@ -294,7 +291,6 @@ def test_not_url_preceeded_exclamation(self):
         self.assertEqual(result.html, u'text !http://example.com')
         self.assertEqual(result.urls, [])
 
-
     # URL numeric tests --------------------------------------------------------
     def test_url_at_numeric(self):
         result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
@@ -306,7 +302,6 @@ def test_url_at_non_numeric(self):
         self.assertEqual(result.html, u'<a href="http://www.flickr.com/photos/29674651@N00/foobar">http://www.flickr.com/photo...</a>')
         self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar'])
 
-
     # URL domain tests ---------------------------------------------------------
     def test_url_WWW(self):
         result = self.parser.parse(u'WWW.EXAMPLE.COM')
@@ -320,7 +315,8 @@ def test_url_www(self):
 
     def test_url_only_domain_query_followed_period(self):
         result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.')
-        self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why?=because.i.want.it">http://tell.me/why?=because...</a>. Even when they contain a URL.')
+        self.assertEqual(
+            result.html, u'I think it\'s proper to end sentences with a period <a href="http://tell.me/why?=because.i.want.it">http://tell.me/why?=because...</a>. Even when they contain a URL.')
         self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it'])
 
     def test_url_only_domain_followed_period(self):
@@ -354,7 +350,6 @@ def test_not_url_under_domain(self):
         self.assertEqual(result.html, u'badly formatted http://foo_bar.com')
         self.assertEqual(result.urls, [])
 
-
     # Hashtag tests ------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_hashtag_followed_full_whitespace(self):
@@ -432,7 +427,6 @@ def test_hashtag_under(self):
         self.assertEqual(result.html, u'text <a href="http://search.twitter.com/search?q=%23hash_tag">#hash_tag</a>')
         self.assertEqual(result.tags, [u'hash_tag'])
 
-
     # Username tests -----------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_not_username_preceded_letter(self):
@@ -515,7 +509,6 @@ def test_username_non_reply(self):
         self.assertEqual(result.users, [u'username'])
         self.assertEqual(result.reply, None)
 
-
     # List tests ---------------------------------------------------------------
     # --------------------------------------------------------------------------
     def test_list_preceeded(self):
@@ -561,7 +554,8 @@ def test_list_not_preceeded_letter(self):
 
     def test_list_long_truncate(self):
         result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
-        self.assertEqual(result.html, u'<a href="http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890">@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A')
+        self.assertEqual(
+            result.html, u'<a href="http://twitter.com/username/list5678901234567890123456789012345678901234567890123456789012345678901234567890">@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890</a>A')
         self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')])
 
     def test_list_with_dash(self):
@@ -571,9 +565,10 @@ def test_list_with_dash(self):
 
 
 class TWPTestsWithSpans(unittest.TestCase):
+
     """Test ttp with re spans to extract character co-ords of matches"""
     def setUp(self):
-        self.parser = ttp.Parser(include_spans = True)
+        self.parser = ttp.Parser(include_spans=True)
 
     def test_spans_in_tweets(self):
         """Test some coca-cola tweets taken from twitter with spans"""
@@ -606,15 +601,14 @@ def test_edge_cases(self):
         self.assertEqual(result.urls, [(u'http://some.com', (1, 16))])
 
 
-
 # Test it!
 if __name__ == '__main__':
     unittest.main()
 
-    #verbosity = 0 # set to 2 for verbose output
-    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases)
-    #unittest.TextTestRunner(verbosity=verbosity).run(suite)
-    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
-    #unittest.TextTestRunner(verbosity=verbosity).run(suite)
-    #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
-    #unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    # verbosity = 0 # set to 2 for verbose output
+    # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases)
+    # unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
+    # unittest.TextTestRunner(verbosity=verbosity).run(suite)
+    # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
+    # unittest.TextTestRunner(verbosity=verbosity).run(suite)
diff --git a/ttp/ttp.py b/ttp/ttp.py
index 1202f2c..ac7c79e 100644
--- a/ttp/ttp.py
+++ b/ttp/ttp.py
@@ -39,8 +39,8 @@
 
 # Users
 USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE)
-REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS \
-              + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE)
+REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS
+                         + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE)
 
 # Hashtags
 HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS
@@ -62,13 +62,14 @@
 URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))'
                        % (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS,
                           PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS),
-                          re.IGNORECASE)
+                       re.IGNORECASE)
 
 # Registered IANA one letter domains
 IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net')
 
 
 class ParseResult(object):
+
     '''A class containing the results of a parsed Tweet.
 
     Attributes:
@@ -109,9 +110,10 @@ def __init__(self, urls, users, reply, lists, tags, html):
 
 
 class Parser(object):
+
     '''A Tweet Parser'''
 
-    def __init__(self, max_url_length=30, include_spans = False):
+    def __init__(self, max_url_length=30, include_spans=False):
         self._max_url_length = max_url_length
         self._include_spans = include_spans
 
@@ -144,7 +146,6 @@ def _html(self, text):
         html = LIST_REGEX.sub(self._parse_lists, html)
         return HASHTAG_REGEX.sub(self._parse_tags, html)
 
-
     # Internal parser stuff ----------------------------------------------------
     def _parse_urls(self, match):
         '''Parse URLs.'''
@@ -186,7 +187,7 @@ def _parse_urls(self, match):
 
         if self._html:
             return '%s%s' % (pre, self.format_url(full_url,
-                                       self._shorten_url(escape(url))))
+                                                  self._shorten_url(escape(url))))
 
     def _parse_users(self, match):
         '''Parse usernames.'''
@@ -261,12 +262,11 @@ def _shorten_url(self, text):
         else:
             return text
 
-
     # User defined formatters --------------------------------------------------
     def format_tag(self, tag, text):
         '''Return formatted HTML for a hashtag.'''
         return '<a href="http://search.twitter.com/search?q=%s">%s%s</a>' \
-                % (urllib.quote('#' + text.encode('utf-8')), tag, text)
+            % (urllib.quote('#' + text.encode('utf-8')), tag, text)
 
     def format_username(self, at_char, user):
         '''Return formatted HTML for a username.'''
@@ -289,4 +289,3 @@ def escape(text):
     return ''.join({'&': '&amp;', '"': '&quot;',
                     '\'': '&apos;', '>': '&gt;',
                     '<': '&lt;'}.get(c, c) for c in text)
-

From 93f6985a4cf68d84d83e0ac30f1c550a0f3c318c Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:43:54 +0100
Subject: [PATCH 31/38] minor

---
 README.rst | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index a8da4cc..c3f76ec 100644
--- a/README.rst
+++ b/README.rst
@@ -3,8 +3,8 @@ twitter-text-python
 
 **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display.
 
-It is based on twitter-text-java_ and passes all the unittests of 
-twitter-text-conformance_ plus some additional ones.
+It is based on twitter-text-java_ and did pass all the unittests of 
+twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5 ):
 
 .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
 .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
@@ -69,7 +69,7 @@ Changelog
 ---------
 
  * 2013/2/11 1.0.0.2 released to PyPI
- * 2013/4/? 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack)
+ * 2013/6/1 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack), used autopep8 to clean the src, added a shortlink expander
 
 
 Tests
@@ -102,6 +102,12 @@ Todo
   * Make it 1 line to parse and get a results dict via __init__.py
   * Tag the next release
 
+Doing a release
+---------------
+
+In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt
+
+
 License
 -------
 

From e00cad8e67bc018c93a6f693a96634b26e903f12 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:55:04 +0100
Subject: [PATCH 32/38] notes on pypi release and git tagging

---
 README.rst | 7 ++++++-
 setup.py   | 4 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index c3f76ec..15bd337 100644
--- a/README.rst
+++ b/README.rst
@@ -105,7 +105,12 @@ Todo
 Doing a release
 ---------------
 
-In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt
+In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:
+
+  $ # edit setup.py to bump the version number
+  $ git tag -a v1.0.1 -m 'v1.0.1 release'
+  $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi
+  $ # this uses ~/.pypirc with cached login details
 
 
 License
diff --git a/setup.py b/setup.py
index 60070b1..2a9dd84 100644
--- a/setup.py
+++ b/setup.py
@@ -2,9 +2,9 @@
 
 setup(
     name='twitter-text-python',
-    version='1.0.0.2',
+    version='1.0.1',
     description='Twitter Tweet parser and formatter',
-    long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.",
+    long_description="Extract @users, #hashtags and URLs (and unwind shortened links) from tweets including entity locations, also generate HTML for output. Visit https://github.com/ianozsvald/twitter-text-python for examples.",
     #open('README.rst').read(),
     author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)',
     author_email='ian@ianozsvald.com',

From 07240991f5a4a54c671ea386ba0961ff56ab936b Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:56:05 +0100
Subject: [PATCH 33/38] note on pushing tags

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 15bd337..1658b54 100644
--- a/README.rst
+++ b/README.rst
@@ -109,6 +109,7 @@ In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The shor
 
   $ # edit setup.py to bump the version number
   $ git tag -a v1.0.1 -m 'v1.0.1 release'
+  $ git push origin --tags
   $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi
   $ # this uses ~/.pypirc with cached login details
 

From 033a5abd173cde36378d5be4f1a2165573381a58 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:57:12 +0100
Subject: [PATCH 34/38] cleanup

---
 README.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.rst b/README.rst
index 1658b54..6d3f315 100644
--- a/README.rst
+++ b/README.rst
@@ -107,11 +107,11 @@ Doing a release
 
 In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:
 
-  $ # edit setup.py to bump the version number
-  $ git tag -a v1.0.1 -m 'v1.0.1 release'
-  $ git push origin --tags
-  $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi
-  $ # this uses ~/.pypirc with cached login details
+    $ # edit setup.py to bump the version number
+    $ git tag -a v1.0.1 -m 'v1.0.1 release'
+    $ git push origin --tags
+    $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi
+    $ # this uses ~/.pypirc with cached login details
 
 
 License

From 66c209bdd53b6c27a9f509f03fbd952cecc69cce Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:58:10 +0100
Subject: [PATCH 35/38] cleanup

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 6d3f315..8935b66 100644
--- a/README.rst
+++ b/README.rst
@@ -107,7 +107,7 @@ Doing a release
 
 In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:
 
-    $ # edit setup.py to bump the version number
+    $ # edit setup.py to bump the version number (ignore)
     $ git tag -a v1.0.1 -m 'v1.0.1 release'
     $ git push origin --tags
     $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi

From aa6bf1acd0b1fa144bf212b184bfb6c947c9fc8f Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Sat, 1 Jun 2013 13:58:50 +0100
Subject: [PATCH 36/38] cleanup

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 8935b66..17b0189 100644
--- a/README.rst
+++ b/README.rst
@@ -105,9 +105,9 @@ Todo
 Doing a release
 ---------------
 
-In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:
+In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form::
 
-    $ # edit setup.py to bump the version number (ignore)
+    $ # edit setup.py to bump the version number
     $ git tag -a v1.0.1 -m 'v1.0.1 release'
     $ git push origin --tags
     $ ianozsvald-twitter-text-python $ python setup.py sdist register  upload -r http://pypi.python.org/pypi

From 756f947a4322f86337180ba9f498140f74d47f35 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 28 Jul 2014 22:43:40 +0100
Subject: [PATCH 37/38] point to Ed for his support

---
 README.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.rst b/README.rst
index 17b0189..607221c 100644
--- a/README.rst
+++ b/README.rst
@@ -3,6 +3,10 @@ twitter-text-python
 
 **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display.
 
+----
+**UPDATE** this project is _now maintained by Ed Burnett_. Please go here for the active version: https://github.com/edburnett/twitter-text-python
+----
+
 It is based on twitter-text-java_ and did pass all the unittests of 
 twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5 ):
 
@@ -56,6 +60,8 @@ To use the shortlink follower:
 Installation
 ------------
 
+**NOTE** this version (Ian's) is no longer maintained, see Ed's active version instead: https://github.com/edburnett/twitter-text-python
+
 pip and easy_install will do the job::
 
     # via: http://pypi.python.org/pypi/twitter-text-python

From 13f4990cd5e1c8b6b424ac867fb7d72a8e0aa330 Mon Sep 17 00:00:00 2001
From: Ian Ozsvald <ian@ianozsvald.com>
Date: Mon, 28 Jul 2014 22:44:41 +0100
Subject: [PATCH 38/38] point to Ed for his support

---
 README.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.rst b/README.rst
index 607221c..2ed8cf0 100644
--- a/README.rst
+++ b/README.rst
@@ -4,7 +4,7 @@ twitter-text-python
 **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display.
 
 ----
-**UPDATE** this project is _now maintained by Ed Burnett_. Please go here for the active version: https://github.com/edburnett/twitter-text-python
+**UPDATE** this project is now maintained by Ed Burnett, please go here for the active version: https://github.com/edburnett/twitter-text-python
 ----
 
 It is based on twitter-text-java_ and did pass all the unittests of