From 82cf8641060725ccf5e4e00e6cc3b60191409e2c Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Fri, 7 Sep 2012 14:51:44 -0700 Subject: [PATCH 01/38] Removed list(set(...)) de-duplicate operations in ParseResults.__init__ as they destory the ordering of urls, users etc in the tweet. The list(set( operation on replies was dangerous as reply was a string not a list (so the string was split into a list of set elements of characters). Removed lots of non-pep8 whitespace --- tests.py | 218 +++++++++++++++++++++++++++---------------------------- ttp.py | 103 +++++++++++++------------- 2 files changed, 160 insertions(+), 161 deletions(-) diff --git a/tests.py b/tests.py index e084abc..4eeb30a 100644 --- a/tests.py +++ b/tests.py @@ -24,8 +24,8 @@ class TWPTests(unittest.TestCase): def setUp(self): self.parser = ttp.Parser() - - + + # General Tests ------------------------------------------------------------ # -------------------------------------------------------------------------- def test_all_not_allow_amp_without_question(self): @@ -33,516 +33,516 @@ def test_all_not_allow_amp_without_question(self): self.assertEqual(result.html, u'Check out: http://www.github.com/test&@username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.urls, [u'http://www.github.com/test']) - + def test_all_not_break_url_at(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - - + + # URL tests ---------------------------------------------------------------- # -------------------------------------------------------------------------- def test_url_mid(self): result = self.parser.parse(u'text http://example.com more text') self.assertEqual(result.html, u'text http://example.com more text') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_unicode(self): result = self.parser.parse(u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp') self.assertEqual(result.html, u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp') self.assertEqual(result.urls, [u'http://\u272adf.ws/ejp']) - + def test_url_parentheses(self): result = self.parser.parse(u'text (http://example.com)') self.assertEqual(result.html, u'text (http://example.com)') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_underscore(self): result = self.parser.parse(u'text http://example.com/test/foo_123.jpg') self.assertEqual(result.html, u'text http://example.com/test/foo...') self.assertEqual(result.urls, [u'http://example.com/test/foo_123.jpg']) - + def test_url_underscore_dot(self): result = self.parser.parse(u'text http://example.com/test/bla.net_foo_123.jpg') self.assertEqual(result.html, u'text http://example.com/test/bla...') self.assertEqual(result.urls, [u'http://example.com/test/bla.net_foo_123.jpg']) - + def test_url_amp_lang_equals(self): result = self.parser.parse(u'Check out http://search.twitter.com/search?q=avro&lang=en') self.assertEqual(result.html, u'Check out http://search.twitter.com/s...') self.assertEqual(result.urls, [u'http://search.twitter.com/search?q=avro&lang=en']) - + def test_url_amp_break(self): result = self.parser.parse(u'Check out http://twitter.com/te?foo&invalid=True') self.assertEqual(result.html, u'Check out http://twitter.com/te?foo...') self.assertEqual(result.urls, [u'http://twitter.com/te?foo&invalid=True']) - + def test_url_dash(self): result = self.parser.parse(u'Is www.foo-bar.com a valid URL?') self.assertEqual(result.html, u'Is www.foo-bar.com a valid URL?') self.assertEqual(result.urls, [u'www.foo-bar.com']) - + def test_url_multiple(self): result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com') self.assertEqual(result.html, u'http://example.com https://sslexample.com http://sub.example.com') self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com']) - + def test_url_raw_domain(self): result = self.parser.parse(u'See http://example.com example.com') self.assertEqual(result.html, u'See http://example.com example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_embed_link(self): result = self.parser.parse(u'http://example.com') self.assertEqual(result.html, u'http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_trailing(self): result = self.parser.parse(u'text http://example.com') self.assertEqual(result.html, u'text http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_japanese(self): result = self.parser.parse(u'いまなにしてるhttp://example.comいまなにしてる') self.assertEqual(result.html, u'いまなにしてるhttp://example.comいまなにしてる') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_lots_of_punctuation(self): result = self.parser.parse(u'text http://xo.com/~matthew+%-,.;x') self.assertEqual(result.html, u'text http://xo.com/~matthew+%-,.;x') self.assertEqual(result.urls, [u'http://xo.com/~matthew+%-,.;x']) - + def test_url_question_numbers(self): result = self.parser.parse(u'text http://example.com/?77e8fd') self.assertEqual(result.html, u'text http://example.com/?77e8fd') self.assertEqual(result.urls, [u'http://example.com/?77e8fd']) - + def test_url_one_letter_other(self): result = self.parser.parse(u'text http://u.nu/') self.assertEqual(result.html, u'text http://u.nu/') self.assertEqual(result.urls, [u'http://u.nu/']) - + result = self.parser.parse(u'text http://u.tv/') self.assertEqual(result.html, u'text http://u.tv/') self.assertEqual(result.urls, [u'http://u.tv/']) - + def test_url_one_letter_iana(self): result = self.parser.parse(u'text http://x.com/') self.assertEqual(result.html, u'text http://x.com/') self.assertEqual(result.urls, [u'http://x.com/']) - + result = self.parser.parse(u'text http://Q.com/') self.assertEqual(result.html, u'text http://Q.com/') self.assertEqual(result.urls, [u'http://Q.com/']) - + result = self.parser.parse(u'text http://z.com/') self.assertEqual(result.html, u'text http://z.com/') self.assertEqual(result.urls, [u'http://z.com/']) - + result = self.parser.parse(u'text http://i.net/') self.assertEqual(result.html, u'text http://i.net/') self.assertEqual(result.urls, [u'http://i.net/']) - + result = self.parser.parse(u'text http://q.net/') self.assertEqual(result.html, u'text http://q.net/') self.assertEqual(result.urls, [u'http://q.net/']) - + result = self.parser.parse(u'text http://X.org/') self.assertEqual(result.html, u'text http://X.org/') self.assertEqual(result.urls, [u'http://X.org/']) - + def test_url_long_hypens(self): result = self.parser.parse(u'text http://word-and-a-number-8-ftw.domain.tld/') self.assertEqual(result.html, u'text http://word-and-a-number-8-...') self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/']) - - + + # URL not tests ------------------------------------------------------------ def test_not_url_dotdotdot(self): result = self.parser.parse(u'Is www...foo a valid URL?') self.assertEqual(result.html, u'Is www...foo a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_dash(self): result = self.parser.parse(u'Is www.-foo.com a valid URL?') self.assertEqual(result.html, u'Is www.-foo.com a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_no_tld(self): result = self.parser.parse(u'Is http://no-tld a valid URL?') self.assertEqual(result.html, u'Is http://no-tld a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_tld_too_short(self): result = self.parser.parse(u'Is http://tld-too-short.x a valid URL?') self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?') self.assertEqual(result.urls, []) - + def test_all_not_break_url_at(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - + def test_not_url_one_letter_iana(self): result = self.parser.parse(u'text http://a.com/ http://a.net/ http://a.org/') self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/') self.assertEqual(result.urls, []) - - + + # URL followed Tests ------------------------------------------------------- def test_url_followed_question(self): result = self.parser.parse(u'text http://example.com?') self.assertEqual(result.html, u'text http://example.com?') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_colon(self): result = self.parser.parse(u'text http://example.com:') self.assertEqual(result.html, u'text http://example.com:') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_curly_brace(self): result = self.parser.parse(u'text http://example.com}') self.assertEqual(result.html, u'text http://example.com}') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_single_quote(self): result = self.parser.parse(u'text http://example.com') self.assertEqual(result.html, u'text http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_dot(self): result = self.parser.parse(u'text http://example.com.') self.assertEqual(result.html, u'text http://example.com.') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_exclamation(self): result = self.parser.parse(u'text http://example.com!') self.assertEqual(result.html, u'text http://example.com!') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_comma(self): result = self.parser.parse(u'text http://example.com,') self.assertEqual(result.html, u'text http://example.com,') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_brace(self): result = self.parser.parse(u'text http://example.com)') self.assertEqual(result.html, u'text http://example.com)') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_big_brace(self): result = self.parser.parse(u'text http://example.com]') self.assertEqual(result.html, u'text http://example.com]') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_equals(self): result = self.parser.parse(u'text http://example.com=') self.assertEqual(result.html, u'text http://example.com=') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_semicolon(self): result = self.parser.parse(u'text http://example.com;') self.assertEqual(result.html, u'text http://example.com;') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_hypen(self): result = self.parser.parse(u'text http://domain.tld-that-you-should-have-put-a-space-after') self.assertEqual(result.html, u'text http://domain.tld-that-you-should-have-put-a-space-after') self.assertEqual(result.urls, [u'http://domain.tld']) - - + + # URL preceeded Tests ------------------------------------------------------- def test_url_preceeded_colon(self): result = self.parser.parse(u'text:http://example.com') self.assertEqual(result.html, u'text:http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_not_url_preceeded_equals(self): result = self.parser.parse(u'text =http://example.com') self.assertEqual(result.html, u'text =http://example.com') self.assertEqual(result.urls, []) - + # NOT def test_not_url_preceeded_forwardslash(self): result = self.parser.parse(u'text /http://example.com') self.assertEqual(result.html, u'text /http://example.com') self.assertEqual(result.urls, []) - + def test_not_url_preceeded_exclamation(self): result = self.parser.parse(u'text !http://example.com') self.assertEqual(result.html, u'text !http://example.com') self.assertEqual(result.urls, []) - - + + # URL numeric tests -------------------------------------------------------- def test_url_at_numeric(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - + def test_url_at_non_numeric(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/foobar') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar']) - - + + # URL domain tests --------------------------------------------------------- def test_url_WWW(self): result = self.parser.parse(u'WWW.EXAMPLE.COM') self.assertEqual(result.html, u'WWW.EXAMPLE.COM') self.assertEqual(result.urls, [u'WWW.EXAMPLE.COM']) - + def test_url_www(self): result = self.parser.parse(u'www.example.com') self.assertEqual(result.html, u'www.example.com') self.assertEqual(result.urls, [u'www.example.com']) - + def test_url_only_domain_query_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.') self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it']) - + def test_url_only_domain_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.') self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me']) - + def test_url_only_domain_path_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.') self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me/why']) - + def test_url_long_tld(self): result = self.parser.parse(u'http://example.mobi/path') self.assertEqual(result.html, u'http://example.mobi/path') self.assertEqual(result.urls, [u'http://example.mobi/path']) - + def test_url_multiple_protocols(self): result = self.parser.parse(u'http://foo.com AND https://bar.com AND www.foobar.com') self.assertEqual(result.html, u'http://foo.com AND https://bar.com AND www.foobar.com') self.assertEqual(result.urls, [u'http://foo.com', u'https://bar.com', u'www.foobar.com']) - + # NOT def test_not_url_exclamation_domain(self): result = self.parser.parse(u'badly formatted http://foo!bar.com') self.assertEqual(result.html, u'badly formatted http://foo!bar.com') self.assertEqual(result.urls, []) - + def test_not_url_under_domain(self): result = self.parser.parse(u'badly formatted http://foo_bar.com') self.assertEqual(result.html, u'badly formatted http://foo_bar.com') self.assertEqual(result.urls, []) - - + + # Hashtag tests ------------------------------------------------------------ # -------------------------------------------------------------------------- def test_hashtag_followed_full_whitespace(self): result = self.parser.parse(u'#hashtag text') self.assertEqual(result.html, u'#hashtag text') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_followed_full_hash(self): result = self.parser.parse(u'#hashtag') self.assertEqual(result.html, u'#hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_preceeded_full_whitespace(self): result = self.parser.parse(u'text #hashtag') self.assertEqual(result.html, u'text #hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_number(self): result = self.parser.parse(u'text #1tag') self.assertEqual(result.html, u'text #1tag') self.assertEqual(result.tags, [u'1tag']) - + def test_not_hashtag_escape(self): result = self.parser.parse(u'&#nbsp;') self.assertEqual(result.html, u'&#nbsp;') self.assertEqual(result.tags, []) - + def test_hashtag_japanese(self): result = self.parser.parse(u'text #hashtagの') self.assertEqual(result.html, u'text #hashtagの') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_period(self): result = self.parser.parse(u'text.#hashtag') self.assertEqual(result.html, u'text.#hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_trailing(self): result = self.parser.parse(u'text #hashtag') self.assertEqual(result.html, u'text #hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_not_hashtag_exclamation(self): result = self.parser.parse(u'text #hashtag!') self.assertEqual(result.html, u'text #hashtag!') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_multiple(self): result = self.parser.parse(u'text #hashtag1 #hashtag2') self.assertEqual(result.html, u'text #hashtag1 #hashtag2') self.assertEqual(result.tags, [u'hashtag1', u'hashtag2']) - + def test_not_hashtag_number(self): result = self.parser.parse(u'text #1234') self.assertEqual(result.html, u'text #1234') self.assertEqual(result.tags, []) - + def test_not_hashtag_text(self): result = self.parser.parse(u'text#hashtag') self.assertEqual(result.html, u'text#hashtag') self.assertEqual(result.tags, []) - + def test_hashtag_umlaut(self): result = self.parser.parse(u'text #hash_tagüäö') self.assertEqual(result.html, u'text #hash_tagüäö') self.assertEqual(result.tags, [u'hash_tag\xfc\xe4\xf6']) - + def test_hashtag_alpha(self): result = self.parser.parse(u'text #hash0tag') self.assertEqual(result.html, u'text #hash0tag') self.assertEqual(result.tags, [u'hash0tag']) - + def test_hashtag_under(self): result = self.parser.parse(u'text #hash_tag') self.assertEqual(result.html, u'text #hash_tag') self.assertEqual(result.tags, [u'hash_tag']) - - + + # Username tests ----------------------------------------------------------- # -------------------------------------------------------------------------- def test_not_username_preceded_letter(self): result = self.parser.parse(u'meet@the beach') self.assertEqual(result.html, u'meet@the beach') self.assertEqual(result.users, []) - + def test_username_preceded_punctuation(self): result = self.parser.parse(u'.@username') self.assertEqual(result.html, u'.@username') self.assertEqual(result.users, [u'username']) - + def test_username_preceded_japanese(self): result = self.parser.parse(u'あ@username') self.assertEqual(result.html, u'あ@username') self.assertEqual(result.users, [u'username']) - + def test_username_followed_japanese(self): result = self.parser.parse(u'@usernameの') self.assertEqual(result.html, u'@usernameの') self.assertEqual(result.users, [u'username']) - + def test_username_surrounded_japanese(self): result = self.parser.parse(u'あ@usernameの') self.assertEqual(result.html, u'あ@usernameの') self.assertEqual(result.users, [u'username']) - + def test_username_followed_punctuation(self): result = self.parser.parse(u'@username&^$%^') self.assertEqual(result.html, u'@username&^$%^') self.assertEqual(result.users, [u'username']) - + def test_not_username_spaced(self): result = self.parser.parse(u'@ username') self.assertEqual(result.html, u'@ username') self.assertEqual(result.users, []) - + def test_username_beginning(self): result = self.parser.parse(u'@username text') self.assertEqual(result.html, u'@username text') self.assertEqual(result.users, [u'username']) - + def test_username_to_long(self): result = self.parser.parse(u'@username9012345678901') self.assertEqual(result.html, u'@username9012345678901') self.assertEqual(result.users, [u'username901234567890']) - + def test_username_full_at_sign(self): result = self.parser.parse(u'@username') self.assertEqual(result.html, u'@username') self.assertEqual(result.users, [u'username']) - + def test_username_trailing(self): result = self.parser.parse(u'text @username') self.assertEqual(result.html, u'text @username') self.assertEqual(result.users, [u'username']) - + # Replies def test_username_reply_simple(self): result = self.parser.parse(u'@username') self.assertEqual(result.html, u'@username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_reply_whitespace(self): result = self.parser.parse(u' @username') self.assertEqual(result.html, u' @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_reply_full(self): result = self.parser.parse(u' @username') self.assertEqual(result.html, u' @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_non_reply(self): result = self.parser.parse(u'test @username') self.assertEqual(result.html, u'test @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, None) - - + + # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): result = self.parser.parse(u'text @username/list') self.assertEqual(result.html, u'text @username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_beginning(self): result = self.parser.parse(u'@username/list') self.assertEqual(result.html, u'@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_preceeded_punctuation(self): result = self.parser.parse(u'.@username/list') self.assertEqual(result.html, u'.@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_followed_punctuation(self): result = self.parser.parse(u'@username/list&^$%^') self.assertEqual(result.html, u'@username/list&^$%^') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_not_slash_space(self): result = self.parser.parse(u'@username/ list') self.assertEqual(result.html, u'@username/ list') self.assertEqual(result.users, [u'username']) self.assertEqual(result.lists, []) - + def test_list_beginning(self): result = self.parser.parse(u'@username/list') self.assertEqual(result.html, u'@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_not_empty_username(self): result = self.parser.parse(u'text @/list') self.assertEqual(result.html, u'text @/list') self.assertEqual(result.lists, []) - + def test_list_not_preceeded_letter(self): result = self.parser.parse(u'meet@the/beach') self.assertEqual(result.html, u'meet@the/beach') self.assertEqual(result.lists, []) - + def test_list_long_truncate(self): result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') self.assertEqual(result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')]) - + def test_list_with_dash(self): result = self.parser.parse(u'text @username/list-foo') self.assertEqual(result.html, u'text @username/list-foo') diff --git a/ttp.py b/ttp.py index 27102a9..c19b787 100644 --- a/ttp.py +++ b/ttp.py @@ -60,71 +60,70 @@ PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS), re.IGNORECASE) - # Registered IANA one letter domains IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net') class ParseResult(object): '''A class containing the results of a parsed Tweet. - + Attributes: - urls: A list containing all the valid urls in the Tweet. - + - users A list containing all the valid usernames in the Tweet. - + - reply A string containing the username this tweet was a reply to. This only matches a username at the beginning of the Tweet, it may however be preceeded by whitespace. Note: It's generally better to rely on the Tweet JSON/XML in order to find out if it's a reply or not. - + - lists A list containing all the valid lists in the Tweet. Each list item is a tuple in the format (username, listname). - + - tags A list containing all the valid tags in theTweet. - + - html A string containg formatted HTML. To change the formatting sublcass twp.Parser and override the format_* methods. - + ''' - + def __init__(self, urls, users, reply, lists, tags, html): - self.urls = list(set(urls)) if urls else [] #fixes dups - self.users = list(set(users)) if users else [] - self.lists = list(set(lists)) if lists else [] - self.reply = list(set(reply)) if reply else [] - self.tags = list(set(tags)) if tags else [] + self.urls = urls if urls else [] + self.users = users if users else [] + self.lists = lists if lists else [] + self.reply = reply if reply else None + self.tags = tags if tags else [] self.html = html class Parser(object): '''A Tweet Parser''' - + def __init__(self, max_url_length=30): self._max_url_length = max_url_length - + def parse(self, text, html=True): '''Parse the text and return a ParseResult instance.''' self._urls = [] self._users = [] self._lists = [] self._tags = [] - + reply = REPLY_REGEX.match(text) reply = reply.groups(0)[0] if reply is not None else None - + parsed_html = self._html(text) if html else self._text(text) return ParseResult(self._urls, self._users, reply, self._lists, self._tags, parsed_html) - + def _text(self, text): '''Parse a Tweet without generating HTML.''' URL_REGEX.sub(self._parse_urls, text) @@ -132,84 +131,84 @@ def _text(self, text): LIST_REGEX.sub(self._parse_lists, text) HASHTAG_REGEX.sub(self._parse_tags, text) return None - + def _html(self, text): '''Parse a Tweet and generate HTML.''' html = URL_REGEX.sub(self._parse_urls, text) html = USERNAME_REGEX.sub(self._parse_users, html) html = LIST_REGEX.sub(self._parse_lists, html) return HASHTAG_REGEX.sub(self._parse_tags, html) - - + + # Internal parser stuff ---------------------------------------------------- def _parse_urls(self, match): '''Parse URLs.''' - + mat = match.group(0) - + # Fix a bug in the regex concerning www...com and www.-foo.com domains # TODO fix this in the regex instead of working around it here domain = match.group(5) if domain[0] in '.-': return mat - + # Only allow IANA one letter domains that are actually registered if len(domain) == 5 \ and domain[-4:].lower() in ('.com', '.org', '.net') \ and not domain.lower() in IANA_ONE_LETTER_DOMAINS: - + return mat - + # Check for urls without http(s) pos = mat.find('http') if pos != -1: pre, url = mat[:pos], mat[pos:] full_url = url - + # Find the www and force http:// else: pos = mat.lower().find('www') pre, url = mat[:pos], mat[pos:] full_url = 'http://%s' % url - + self._urls.append(url) - + if self._html: return '%s%s' % (pre, self.format_url(full_url, self._shorten_url(escape(url)))) - + def _parse_users(self, match): '''Parse usernames.''' - + # Don't parse lists here if match.group(2) is not None: return match.group(0) - + mat = match.group(0) self._users.append(mat[1:]) - + if self._html: return self.format_username(mat[0:1], mat[1:]) - + def _parse_lists(self, match): '''Parse lists.''' - + # Don't parse usernames here if match.group(4) is None: return match.group(0) - + pre, at_char, user, list_name = match.groups() list_name = list_name[1:] self._lists.append((user, list_name)) - + if self._html: return '%s%s' % (pre, self.format_list(at_char, user, list_name)) - + def _parse_tags(self, match): '''Parse hashtags.''' - + mat = match.group(0) - + # Fix problems with the regex capturing stuff infront of the # tag = None for i in u'#\uff03': @@ -217,45 +216,45 @@ def _parse_tags(self, match): if pos != -1: tag = i break - + pre, text = mat[:pos], mat[pos + 1:] self._tags.append(text) - + if self._html: return '%s%s' % (pre, self.format_tag(tag, text)) - + def _shorten_url(self, text): '''Shorten a URL and make sure to not cut of html entities.''' - + if len(text) > self._max_url_length and self._max_url_length != -1: text = text[0:self._max_url_length - 3] amp = text.rfind('&') close = text.rfind(';') if amp != -1 and (close == -1 or close < amp): text = text[0:amp] - + return text + '...' - + else: return text - - + + # User defined formatters -------------------------------------------------- def format_tag(self, tag, text): '''Return formatted HTML for a hashtag.''' return '%s%s' \ % (urllib.quote('#' + text.encode('utf-8')), tag, text) - + def format_username(self, at_char, user): '''Return formatted HTML for a username.''' return '%s%s' \ % (user, at_char, user) - + def format_list(self, at_char, user, list_name): '''Return formatted HTML for a list.''' return '%s%s/%s' \ % (user, list_name, at_char, user, list_name) - + def format_url(self, url, text): '''Return formatted HTML for a url.''' return '%s' % (escape(url), text) From 71b793a35a6bdbb903a2b06c0e41975e69b87cc5 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Fri, 7 Sep 2012 15:05:52 -0700 Subject: [PATCH 02/38] Applied schwa's span addition https://github.com/schwa/twitter-text-python/commit/b81cef33a6fc12c837936d60a0b4a86222d45a4f to add option to extract span for matched parts of message for URLs, users etc --- ttp.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/ttp.py b/ttp.py index c19b787..98a8844 100644 --- a/ttp.py +++ b/ttp.py @@ -107,8 +107,9 @@ def __init__(self, urls, users, reply, lists, tags, html): class Parser(object): '''A Tweet Parser''' - def __init__(self, max_url_length=30): + def __init__(self, max_url_length=30, include_spans = False): self._max_url_length = max_url_length + self._include_spans = include_spans def parse(self, text, html=True): '''Parse the text and return a ParseResult instance.''' @@ -171,7 +172,10 @@ def _parse_urls(self, match): pre, url = mat[:pos], mat[pos:] full_url = 'http://%s' % url - self._urls.append(url) + if self._include_spans: + self._urls.append((url, match.span(0))) + else: + self._urls.append(url) if self._html: return '%s%s' % (pre, self.format_url(full_url, @@ -185,7 +189,10 @@ def _parse_users(self, match): return match.group(0) mat = match.group(0) - self._users.append(mat[1:]) + if self._include_spans: + self._users.append((mat[1:], match.span(0))) + else: + self._users.append(mat[1:]) if self._html: return self.format_username(mat[0:1], mat[1:]) @@ -199,7 +206,10 @@ def _parse_lists(self, match): pre, at_char, user, list_name = match.groups() list_name = list_name[1:] - self._lists.append((user, list_name)) + if self._include_spans: + self._lists.append((user, list_name, match.span(0))) + else: + self._lists.append((user, list_name)) if self._html: return '%s%s' % (pre, self.format_list(at_char, user, list_name)) @@ -218,7 +228,10 @@ def _parse_tags(self, match): break pre, text = mat[:pos], mat[pos + 1:] - self._tags.append(text) + if self._include_spans: + self._tags.append((text, match.span(0))) + else: + self._tags.append(text) if self._html: return '%s%s' % (pre, self.format_tag(tag, text)) From ff5a0c024ff2c4ced47f14fee007247913cc1888 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Fri, 7 Sep 2012 16:00:08 -0700 Subject: [PATCH 03/38] added span tests as a separate class --- tests.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tests.py b/tests.py index 4eeb30a..c409443 100644 --- a/tests.py +++ b/tests.py @@ -549,7 +549,33 @@ def test_list_with_dash(self): self.assertEqual(result.lists, [(u'username', u'list-foo')]) +class TWPTestsWithSpans(unittest.TestCase): + """Test ttp with re spans to extract character co-ords of matches""" + def setUp(self): + self.parser = ttp.Parser(include_spans = True) + + def test_spans_in_tweets(self): + """Test some coca-cola tweets taken from twitter with spans""" + result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7') + self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))]) + + result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG') + self.assertEqual(result.urls, []) + self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))]) + self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))]) + + result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA') + self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))]) + self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))]) + self.assertEqual(result.tags, [(u'GameOn', (207, 215)), (u'ad', (215, 219))]) + + # Test it! if __name__ == '__main__': - unittest.main() + #unittest.main() # only seems to run 1 class? + verbosity = 0 # set to 2 for verbose output + suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) + unittest.TextTestRunner(verbosity=verbosity).run(suite) + suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) + unittest.TextTestRunner(verbosity=verbosity).run(suite) From a202185a6cf45a6e07b0b7eacf00e4b1da0dc19e Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Fri, 7 Sep 2012 16:03:01 -0700 Subject: [PATCH 04/38] not sure what happened, unittest.main() does the job now --- tests.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests.py b/tests.py index c409443..4c62778 100644 --- a/tests.py +++ b/tests.py @@ -572,10 +572,10 @@ def test_spans_in_tweets(self): # Test it! if __name__ == '__main__': - #unittest.main() # only seems to run 1 class? + unittest.main() # only seems to run 1 class? - verbosity = 0 # set to 2 for verbose output - suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) - unittest.TextTestRunner(verbosity=verbosity).run(suite) - suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) - unittest.TextTestRunner(verbosity=verbosity).run(suite) + #verbosity = 0 # set to 2 for verbose output + #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) + #unittest.TextTestRunner(verbosity=verbosity).run(suite) + #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) + #unittest.TextTestRunner(verbosity=verbosity).run(suite) From 90fbc84d244a5445b005d679280077d183f05e5f Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sun, 9 Sep 2012 16:14:44 -0700 Subject: [PATCH 05/38] added test for hash and comma in URL --- tests.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests.py b/tests.py index 4c62778..3ba1c90 100644 --- a/tests.py +++ b/tests.py @@ -28,6 +28,12 @@ def setUp(self): # General Tests ------------------------------------------------------------ # -------------------------------------------------------------------------- + def test_urls(self): + """Confirm that # in a URL works along with ,""" + result = self.parser.parse(u'big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag') + self.assertEqual(result.urls, [u'http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2']) + self.assertEqual(result.tags, [u'ahashtag']) + def test_all_not_allow_amp_without_question(self): result = self.parser.parse(u'Check out: http://www.github.com/test&@username') self.assertEqual(result.html, u'Check out: http://www.github.com/test&@username') From b25880ab09f21927b177ab8af8560cdf3b1a9474 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sun, 9 Sep 2012 23:55:43 -0700 Subject: [PATCH 06/38] uncovered two name-shielded tests and renamed, now also using non-html text for the span=True tests --- tests.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests.py b/tests.py index 3ba1c90..c839b53 100644 --- a/tests.py +++ b/tests.py @@ -184,7 +184,7 @@ def test_not_url_tld_too_short(self): self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?') self.assertEqual(result.urls, []) - def test_all_not_break_url_at(self): + def test_all_not_break_url_at2(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) @@ -529,7 +529,7 @@ def test_list_not_slash_space(self): self.assertEqual(result.users, [u'username']) self.assertEqual(result.lists, []) - def test_list_beginning(self): + def test_list_beginning2(self): result = self.parser.parse(u'@username/list') self.assertEqual(result.html, u'@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) @@ -565,16 +565,19 @@ def test_spans_in_tweets(self): result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7') self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))]) - result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG') + result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False) self.assertEqual(result.urls, []) self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))]) self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))]) - result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA') + result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False) self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))]) self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))]) - self.assertEqual(result.tags, [(u'GameOn', (207, 215)), (u'ad', (215, 219))]) + self.assertEqual(result.tags, [(u'GameOn', (75, 83)), (u'ad', (83, 87))]) + def test_users_in_tweets(self): + result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False) + self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))]) # Test it! if __name__ == '__main__': From 536ba80fdd50815adc9487a88aa416bd447ae9d9 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 10 Sep 2012 00:39:50 -0700 Subject: [PATCH 07/38] removed off-by-one offset for URL and hashtag matcher if a pre character e.g. space exists --- tests.py | 25 ++++++++++++++++++++----- ttp.py | 10 ++++++++-- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tests.py b/tests.py index c839b53..26fb373 100644 --- a/tests.py +++ b/tests.py @@ -563,27 +563,42 @@ def setUp(self): def test_spans_in_tweets(self): """Test some coca-cola tweets taken from twitter with spans""" result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7') - self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (40, 61))]) + self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (41, 61))]) result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False) self.assertEqual(result.urls, []) - self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (0, 34))]) + self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (1, 34))]) self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))]) result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False) - self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (94, 115))]) + self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (95, 115))]) self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))]) - self.assertEqual(result.tags, [(u'GameOn', (75, 83)), (u'ad', (83, 87))]) + self.assertEqual(result.tags, [(u'GameOn', (76, 83)), (u'ad', (84, 87))]) def test_users_in_tweets(self): result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False) self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))]) + def test_edge_cases(self): + """Some edge cases that upset the original version of ttp""" + result = self.parser.parse(u' @user', html=False) + self.assertEqual(result.users, [(u'user', (1, 6))]) + + result = self.parser.parse(u' #hash ', html=False) + self.assertEqual(result.tags, [(u'hash', (1, 6))]) + + result = self.parser.parse(u' http://some.com ', html=False) + self.assertEqual(result.urls, [(u'http://some.com', (1, 16))]) + + + # Test it! if __name__ == '__main__': - unittest.main() # only seems to run 1 class? + unittest.main() #verbosity = 0 # set to 2 for verbose output + #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases) + #unittest.TextTestRunner(verbosity=verbosity).run(suite) #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) #unittest.TextTestRunner(verbosity=verbosity).run(suite) #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) diff --git a/ttp.py b/ttp.py index 98a8844..b4552b3 100644 --- a/ttp.py +++ b/ttp.py @@ -173,7 +173,10 @@ def _parse_urls(self, match): full_url = 'http://%s' % url if self._include_spans: - self._urls.append((url, match.span(0))) + span = match.span(0) + # add an offset if pre is e.g. ' ' + span = (span[0] + len(pre), span[1]) + self._urls.append((url, span)) else: self._urls.append(url) @@ -229,7 +232,10 @@ def _parse_tags(self, match): pre, text = mat[:pos], mat[pos + 1:] if self._include_spans: - self._tags.append((text, match.span(0))) + span = match.span(0) + # add an offset if pre is e.g. ' ' + span = (span[0] + len(pre), span[1]) + self._tags.append((text, span)) else: self._tags.append(text) From a8c77dcbc1e04429f9159c6080b78f372e5845fd Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Wed, 12 Sep 2012 21:25:37 -0700 Subject: [PATCH 08/38] added reference to the original project --- README.rst | 6 ++++++ ttp.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/README.rst b/README.rst index 207d3a9..06410ff 100644 --- a/README.rst +++ b/README.rst @@ -9,6 +9,12 @@ twitter-text-conformance_ plus some additional ones. .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance +UPDATE - forked by Ian Ozsvald, some bugs fixed, few minor changes to functionality added: +https://github.com/ianozsvald/twitter-text-python + +The original ttp comes from: +https://github.com/BonsaiDen/twitter-text-python + Usage:: >>> import ttp diff --git a/ttp.py b/ttp.py index b4552b3..8e68bf6 100644 --- a/ttp.py +++ b/ttp.py @@ -15,6 +15,10 @@ # TODO create a setup.py +# Forked by Ian Ozsvald: +# https://github.com/ianozsvald/twitter-text-python +# from: +# https://github.com/BonsaiDen/twitter-text-python # Tweet Parser and Formatter --------------------------------------------------- # ------------------------------------------------------------------------------ From f3095689bd98f9969fe507b0a833c9e7156801cf Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Wed, 12 Sep 2012 21:27:53 -0700 Subject: [PATCH 09/38] changed URL --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 9de7d83..a2d99b1 100644 --- a/setup.py +++ b/setup.py @@ -5,9 +5,9 @@ version='1.0', description='Tweet parser and formatter', long_description=open('README.rst').read(), - author='Ivo Wetzel', + author='Ivo Wetzel (fork by Ian Ozsvald)', author_email='', - url='http://github.com/BonsaiDen/twitter-text-python', + url='https://github.com/ianozsvald/twitter-text-python', license='GPL', py_modules=['ttp'], include_package_data=True, From be4d2e35c1a17ac2a8a8e904c62df25cb964b01f Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Wed, 12 Sep 2012 21:29:15 -0700 Subject: [PATCH 10/38] first --- __init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 __init__.py diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 From 489ca0461d3cbecd3f78d3098c785325940e298d Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:12:19 +0000 Subject: [PATCH 11/38] preparing for V1.0.0 release --- README.rst | 92 ++++++++++++++++++++++++++++++++++++++++-------------- setup.py | 18 +++++------ ttp.py | 2 ++ 3 files changed, 79 insertions(+), 33 deletions(-) diff --git a/README.rst b/README.rst index 06410ff..7cf7840 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ twitter-text-python =================== -**twitter-text-python** is a Tweet parser and formatter for Python. +**twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display. It is based on twitter-text-java_ and passes all the unittests of twitter-text-conformance_ plus some additional ones. @@ -9,32 +9,67 @@ twitter-text-conformance_ plus some additional ones. .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance -UPDATE - forked by Ian Ozsvald, some bugs fixed, few minor changes to functionality added: +This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added: https://github.com/ianozsvald/twitter-text-python -The original ttp comes from: +PyPI release: +http://pypi.python.org/pypi/twitter-text-python/ + +The original ttp comes from Ivo Wetzel (Ivo's version no longer supported): https://github.com/BonsaiDen/twitter-text-python Usage:: >>> import ttp >>> p = ttp.Parser() - >>> result = p.parse("@BonsaiDen Hey that's a great Tweet parser! #twp") + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") >>> result.reply - 'BonsaiDen' + 'ianozsvald' >>> result.users - ['BonsaiDen'] + ['ianozsvald'] >>> result.tags - ['twp'] + ['IvoWertzel'] >>> result.urls - [] + ['https://github.com/ianozsvald/'] >>> result.html - u'@BonsaiDen Hey that\'s a great Tweet Parser! - #twp' - + u'@ianozsvald, you now support #IvoWertzel\'s tweet parser! https://github.com/ianozsvald/' If you need different HTML output just subclass and override the ``format_*`` methods. +You can also ask for the span tags to be returned for each entity:: + + >>> p = ttp.Parser(include_spans=True) + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") + >>> result.urls + [('https://github.com/ianozsvald/', (57, 87))] + + + +Installation +------------ + + $ pip install twitter-text-python # via: http://pypi.python.org/pypi/twitter-text-python + $ python + >>> import ttp + >>> ttp.__version__ + '1.0.0' + + +Changelog +--------- + + * 2013/2/11 1.0.0 released to PyPI + + +Tests +----- + + $ python tests.py + ................................................................................................. + ---------------------------------------------------------------------- + Ran 97 tests in 0.009s + OK + Contributing ------------ @@ -43,23 +78,32 @@ The source is available on GitHub_, to contribute to the project, fork it on GitHub and send a pull request. Everyone is welcome to make improvements to **twp**! -.. _GitHub: http://github.com/BonsaiDen/twitter-text-python +.. _GitHub: https://github.com/ianozsvald/twitter-text-python + License -======= +------- + +*MIT* -Copyright (c) 2010 Ivo Wetzel +Copyright (c) 2012 Ivo Wetzel. -**twitter-text-python** is free software: you can redistribute it and/or -modify it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -**twitter-text-python** is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. -You should have received a copy of the GNU General Public License along with -**twitter-text-python**. If not, see . +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +Copyright (c) 2010-2013 Ivo Wetzel diff --git a/setup.py b/setup.py index a2d99b1..8d1305a 100644 --- a/setup.py +++ b/setup.py @@ -3,21 +3,21 @@ setup( name='twitter-text-python', version='1.0', - description='Tweet parser and formatter', - long_description=open('README.rst').read(), - author='Ivo Wetzel (fork by Ian Ozsvald)', - author_email='', + description='Twitter Tweet parser and formatter', + long_description="no long description", #open('README.rst').read(), + author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)', + author_email='ian@ianozsvald.com', url='https://github.com/ianozsvald/twitter-text-python', - license='GPL', - py_modules=['ttp'], + license='MIT', + py_modules=['ttp', 'tests'], include_package_data=True, zip_safe=False, install_requires=[], + #data_files=[('./', ['README.rst'])], classifiers=[ - 'Environment :: Web Environment', - # I don't know what exactly this means, but why not? + 'Environment :: Console', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + #'License :: OSI Approved :: GPL License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Topic :: Software Development :: Libraries :: Python Modules', diff --git a/ttp.py b/ttp.py index 8e68bf6..b599d6d 100644 --- a/ttp.py +++ b/ttp.py @@ -25,6 +25,8 @@ import re import urllib +__version__ = "1.0.0" + # Some of this code has been translated from the twitter-text-java library: # AT_SIGNS = ur'[@\uff20]' From e2c57a50ad7b2f8efadf398972cf2ce37e34d28f Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:15:42 +0000 Subject: [PATCH 12/38] weird formatting bug --- README.rst | 101 ----------------------------------------------------- 1 file changed, 101 deletions(-) diff --git a/README.rst b/README.rst index 7cf7840..3c04032 100644 --- a/README.rst +++ b/README.rst @@ -6,104 +6,3 @@ twitter-text-python It is based on twitter-text-java_ and passes all the unittests of twitter-text-conformance_ plus some additional ones. -.. _twitter-text-java: http://github.com/mzsanford/twitter-text-java -.. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance - -This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added: -https://github.com/ianozsvald/twitter-text-python - -PyPI release: -http://pypi.python.org/pypi/twitter-text-python/ - -The original ttp comes from Ivo Wetzel (Ivo's version no longer supported): -https://github.com/BonsaiDen/twitter-text-python - -Usage:: - - >>> import ttp - >>> p = ttp.Parser() - >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") - >>> result.reply - 'ianozsvald' - >>> result.users - ['ianozsvald'] - >>> result.tags - ['IvoWertzel'] - >>> result.urls - ['https://github.com/ianozsvald/'] - >>> result.html - u'@ianozsvald, you now support #IvoWertzel\'s tweet parser! https://github.com/ianozsvald/' - -If you need different HTML output just subclass and override the ``format_*`` methods. - -You can also ask for the span tags to be returned for each entity:: - - >>> p = ttp.Parser(include_spans=True) - >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") - >>> result.urls - [('https://github.com/ianozsvald/', (57, 87))] - - - -Installation ------------- - - $ pip install twitter-text-python # via: http://pypi.python.org/pypi/twitter-text-python - $ python - >>> import ttp - >>> ttp.__version__ - '1.0.0' - - -Changelog ---------- - - * 2013/2/11 1.0.0 released to PyPI - - -Tests ------ - - $ python tests.py - ................................................................................................. - ---------------------------------------------------------------------- - Ran 97 tests in 0.009s - OK - - -Contributing ------------- - -The source is available on GitHub_, to -contribute to the project, fork it on GitHub and send a pull request. -Everyone is welcome to make improvements to **twp**! - -.. _GitHub: https://github.com/ianozsvald/twitter-text-python - - -License -------- - -*MIT* - -Copyright (c) 2012 Ivo Wetzel. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. - -Copyright (c) 2010-2013 Ivo Wetzel From 2ae04ff21c3492372ae7b492c34caad1536bd7e9 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:16:19 +0000 Subject: [PATCH 13/38] weird formatting bug --- README.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.rst b/README.rst index 3c04032..00227fa 100644 --- a/README.rst +++ b/README.rst @@ -6,3 +6,15 @@ twitter-text-python It is based on twitter-text-java_ and passes all the unittests of twitter-text-conformance_ plus some additional ones. +.. _twitter-text-java: http://github.com/mzsanford/twitter-text-java +.. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance + +This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added: +https://github.com/ianozsvald/twitter-text-python + +PyPI release: +http://pypi.python.org/pypi/twitter-text-python/ + +The original ttp comes from Ivo Wetzel (Ivo's version no longer supported): +https://github.com/BonsaiDen/twitter-text-python + From c8e40cd24c7219eba192f598870561025434a2f2 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:16:44 +0000 Subject: [PATCH 14/38] weird formatting bug --- README.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.rst b/README.rst index 00227fa..9e3d406 100644 --- a/README.rst +++ b/README.rst @@ -18,3 +18,19 @@ http://pypi.python.org/pypi/twitter-text-python/ The original ttp comes from Ivo Wetzel (Ivo's version no longer supported): https://github.com/BonsaiDen/twitter-text-python +Usage:: + + >>> import ttp + >>> p = ttp.Parser() + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") + >>> result.reply + 'ianozsvald' + >>> result.users + ['ianozsvald'] + >>> result.tags + ['IvoWertzel'] + >>> result.urls + ['https://github.com/ianozsvald/'] + >>> result.html + u'@ianozsvald, you now support #IvoWertzel\'s tweet parser! https://github.com/ianozsvald/' + From 77ff625ed016efe28d0087d301e66238d7fb81b0 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:17:14 +0000 Subject: [PATCH 15/38] weird formatting bug --- README.rst | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/README.rst b/README.rst index 9e3d406..51219ec 100644 --- a/README.rst +++ b/README.rst @@ -34,3 +34,49 @@ Usage:: >>> result.html u'@ianozsvald, you now support #IvoWertzel\'s tweet parser! https://github.com/ianozsvald/' +If you need different HTML output just subclass and override the ``format_*`` methods. + +You can also ask for the span tags to be returned for each entity:: + + >>> p = ttp.Parser(include_spans=True) + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") + >>> result.urls + [('https://github.com/ianozsvald/', (57, 87))] + + + +Installation +------------ + + $ pip install twitter-text-python # via: http://pypi.python.org/pypi/twitter-text-python + $ python + >>> import ttp + >>> ttp.__version__ + '1.0.0' + + +Changelog +--------- + + * 2013/2/11 1.0.0 released to PyPI + + +Tests +----- + + $ python tests.py + ................................................................................................. + ---------------------------------------------------------------------- + Ran 97 tests in 0.009s + OK + + +Contributing +------------ + +The source is available on GitHub_, to +contribute to the project, fork it on GitHub and send a pull request. +Everyone is welcome to make improvements to **twp**! + +.. _GitHub: https://github.com/ianozsvald/twitter-text-python + From 4297316f2e8a31cd6d6f4335ec444b22369795bb Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:18:05 +0000 Subject: [PATCH 16/38] weird formatting bug --- README.rst | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/README.rst b/README.rst index 51219ec..1ce4b94 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,7 @@ Installation Changelog --------- - * 2013/2/11 1.0.0 released to PyPI +2013/2/11 1.0.0 released to PyPI Tests @@ -70,13 +70,3 @@ Tests Ran 97 tests in 0.009s OK - -Contributing ------------- - -The source is available on GitHub_, to -contribute to the project, fork it on GitHub and send a pull request. -Everyone is welcome to make improvements to **twp**! - -.. _GitHub: https://github.com/ianozsvald/twitter-text-python - From 22c73a9430065d013450fc1cbe5e38088c237c86 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:18:39 +0000 Subject: [PATCH 17/38] weird formatting bug --- README.rst | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/README.rst b/README.rst index 1ce4b94..987c767 100644 --- a/README.rst +++ b/README.rst @@ -55,18 +55,3 @@ Installation '1.0.0' -Changelog ---------- - -2013/2/11 1.0.0 released to PyPI - - -Tests ------ - - $ python tests.py - ................................................................................................. - ---------------------------------------------------------------------- - Ran 97 tests in 0.009s - OK - From e2e36155ab93586ceda15f84b6f7069ca35547f4 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:19:29 +0000 Subject: [PATCH 18/38] weird formatting bug --- README.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 987c767..4cd9fd0 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,10 @@ You can also ask for the span tags to be returned for each entity:: Installation ------------ - $ pip install twitter-text-python # via: http://pypi.python.org/pypi/twitter-text-python +pip and easy_install will do the job:: + + # via: http://pypi.python.org/pypi/twitter-text-python + $ pip install twitter-text-python $ python >>> import ttp >>> ttp.__version__ From 4b8121cf335e238c4e80052c119b1413c7394d9b Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:20:47 +0000 Subject: [PATCH 19/38] weird formatting bug --- README.rst | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.rst b/README.rst index 4cd9fd0..b9c08d2 100644 --- a/README.rst +++ b/README.rst @@ -57,4 +57,23 @@ pip and easy_install will do the job:: >>> ttp.__version__ '1.0.0' +Changelog +--------- + +Release history:: + + * 2013/2/11 1.0.0 released to PyPI + + +Tests +----- + +Checkout the code via github https://github.com/ianozsvald/twitter-text-python and run tests locally:: + + $ python tests.py + ................................................................................................. + ---------------------------------------------------------------------- + Ran 97 tests in 0.009s + OK + From c024c5803f7ac81704baadfbe3ce11b9501fc1c2 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:21:14 +0000 Subject: [PATCH 20/38] weird formatting bug --- README.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.rst b/README.rst index b9c08d2..118f820 100644 --- a/README.rst +++ b/README.rst @@ -60,9 +60,7 @@ pip and easy_install will do the job:: Changelog --------- -Release history:: - - * 2013/2/11 1.0.0 released to PyPI + * 2013/2/11 1.0.0 released to PyPI Tests From 9b86dc1dfc0e7cfdb8e6b114416c36a22a1051d5 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:21:44 +0000 Subject: [PATCH 21/38] weird formatting bug --- README.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.rst b/README.rst index 118f820..ef3c921 100644 --- a/README.rst +++ b/README.rst @@ -74,4 +74,40 @@ Checkout the code via github https://github.com/ianozsvald/twitter-text-python a Ran 97 tests in 0.009s OK +Contributing +------------ + +The source is available on GitHub_, to +contribute to the project, fork it on GitHub and send a pull request. +Everyone is welcome to make improvements to **twp**! + +.. _GitHub: https://github.com/ianozsvald/twitter-text-python + + +License +------- + +*MIT* + +Copyright (c) 2012 Ivo Wetzel. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +Copyright (c) 2010-2013 Ivo Wetzel From 400758b5cbe18d4a3b240261cca514952c0caa38 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 21:30:01 +0000 Subject: [PATCH 22/38] minor --- setup.py | 5 +++-- ttp.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 8d1305a..9f00854 100644 --- a/setup.py +++ b/setup.py @@ -2,9 +2,10 @@ setup( name='twitter-text-python', - version='1.0', + version='1.0.0', description='Twitter Tweet parser and formatter', - long_description="no long description", #open('README.rst').read(), + long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.", + #open('README.rst').read(), author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)', author_email='ian@ianozsvald.com', url='https://github.com/ianozsvald/twitter-text-python', diff --git a/ttp.py b/ttp.py index b599d6d..5194297 100644 --- a/ttp.py +++ b/ttp.py @@ -13,8 +13,6 @@ # You should have received a copy of the GNU General Public License along with # twitter-text-python. If not, see . -# TODO create a setup.py - # Forked by Ian Ozsvald: # https://github.com/ianozsvald/twitter-text-python # from: From bdf73168bd859be69cb936456eab46915ee1d3c6 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 11 Feb 2013 22:08:35 +0000 Subject: [PATCH 23/38] version bump after fixing up setup.py to use a subdirectory --- README.rst | 8 ++++---- setup.py | 8 ++++---- __init__.py => ttp/__init__.py | 0 tests.py => ttp/tests.py | 0 ttp.py => ttp/ttp.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) rename __init__.py => ttp/__init__.py (100%) rename tests.py => ttp/tests.py (100%) rename ttp.py => ttp/ttp.py (99%) diff --git a/README.rst b/README.rst index ef3c921..ccfcd33 100644 --- a/README.rst +++ b/README.rst @@ -20,7 +20,7 @@ https://github.com/BonsaiDen/twitter-text-python Usage:: - >>> import ttp + >>> from ttp import ttp >>> p = ttp.Parser() >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") >>> result.reply @@ -53,14 +53,14 @@ pip and easy_install will do the job:: # via: http://pypi.python.org/pypi/twitter-text-python $ pip install twitter-text-python $ python - >>> import ttp + >>> from ttp import ttp >>> ttp.__version__ - '1.0.0' + '1.0.0.2' Changelog --------- - * 2013/2/11 1.0.0 released to PyPI + * 2013/2/11 1.0.0.2 released to PyPI Tests diff --git a/setup.py b/setup.py index 9f00854..60070b1 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='twitter-text-python', - version='1.0.0', + version='1.0.0.2', description='Twitter Tweet parser and formatter', long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.", #open('README.rst').read(), @@ -10,17 +10,17 @@ author_email='ian@ianozsvald.com', url='https://github.com/ianozsvald/twitter-text-python', license='MIT', - py_modules=['ttp', 'tests'], + packages=['ttp'], include_package_data=True, zip_safe=False, install_requires=[], - #data_files=[('./', ['README.rst'])], classifiers=[ 'Environment :: Console', 'Intended Audience :: Developers', - #'License :: OSI Approved :: GPL License', + 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing :: Linguistic', ] ) diff --git a/__init__.py b/ttp/__init__.py similarity index 100% rename from __init__.py rename to ttp/__init__.py diff --git a/tests.py b/ttp/tests.py similarity index 100% rename from tests.py rename to ttp/tests.py diff --git a/ttp.py b/ttp/ttp.py similarity index 99% rename from ttp.py rename to ttp/ttp.py index 5194297..2996038 100644 --- a/ttp.py +++ b/ttp/ttp.py @@ -23,7 +23,7 @@ import re import urllib -__version__ = "1.0.0" +__version__ = "1.0.0.2" # Some of this code has been translated from the twitter-text-java library: # From 52c61013ce1fedfe2fc640dd48f93f278beeb4a2 Mon Sep 17 00:00:00 2001 From: Lee Semel Date: Mon, 25 Mar 2013 10:27:04 -0400 Subject: [PATCH 24/38] Fix t.co urls followed by a comma --- ttp/tests.py | 5 +++++ ttp/ttp.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ttp/tests.py b/ttp/tests.py index 26fb373..4d01e1d 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -231,6 +231,11 @@ def test_url_followed_comma(self): self.assertEqual(result.html, u'text http://example.com,') self.assertEqual(result.urls, [u'http://example.com']) + def test_url_with_path_followed_comma(self): + result = self.parser.parse(u'text http://example.com/abcde, more') + self.assertEqual(result.html, u'text http://example.com/abcde, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + def test_url_followed_brace(self): result = self.parser.parse(u'text http://example.com)') self.assertEqual(result.html, u'text http://example.com)') diff --git a/ttp/ttp.py b/ttp/ttp.py index 2996038..b8f8404 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -59,7 +59,7 @@ PATH_ENDING_CHARS = r'[%s\)=#/]' % UTF_CHARS QUERY_ENDING_CHARS = '[a-z0-9_&=#]' -URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/%s*%s?)?(\?%s*%s)?))' +URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))' % (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS, PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS), re.IGNORECASE) From a9973f9cb3456bf45512fb41c343285b118b8d22 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Tue, 26 Mar 2013 11:23:53 +0000 Subject: [PATCH 25/38] added some notes for TODO --- README.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.rst b/README.rst index ccfcd33..5be038d 100644 --- a/README.rst +++ b/README.rst @@ -84,6 +84,13 @@ Everyone is welcome to make improvements to **twp**! .. _GitHub: https://github.com/ianozsvald/twitter-text-python +Todo +---- + + * Consider adding capitalised phrase identification + * Make it 1 line to parse and get a results dict via __init__.py + * Tag the next release + License ------- From 19e2368e3e41cd769f733e5db1a1c713419ecb83 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Thu, 28 Mar 2013 14:52:31 +0000 Subject: [PATCH 26/38] bump of version nbr for this new working version, added a shortlink follower in utils.py --- README.rst | 9 +++++++++ ttp/ttp.py | 2 +- ttp/utils.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 ttp/utils.py diff --git a/README.rst b/README.rst index 5be038d..f7cf047 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,14 @@ You can also ask for the span tags to be returned for each entity:: [('https://github.com/ianozsvald/', (57, 87))] +To use the shortlink follower: + + >>> from ttp import utils + >>> # assume that result.urls == ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF'] + >>> print utils.follow_shortlinks(result.urls) # pass in list of shortlink URLs + {'http://t.co/8o0z9BbEMu': [u'http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562'], u'http://bbc.in/16dClPF': [u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562']} + >>> # note that bad shortlink URLs have a key to an empty list (lost/forgotten shortlink URLs don't generate any error) + Installation ------------ @@ -61,6 +69,7 @@ Changelog --------- * 2013/2/11 1.0.0.2 released to PyPI + * 2013/4/? 1.0.1 new working version Tests diff --git a/ttp/ttp.py b/ttp/ttp.py index 2996038..4b7cb83 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -23,7 +23,7 @@ import re import urllib -__version__ = "1.0.0.2" +__version__ = "1.0.1.0" # Some of this code has been translated from the twitter-text-java library: # diff --git a/ttp/utils.py b/ttp/utils.py new file mode 100644 index 0000000..2c3d822 --- /dev/null +++ b/ttp/utils.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Unwind short-links e.g. bit.ly, t.co etc to their canonical links""" +import requests + + +def follow_shortlinks(shortlinks): + """Follow redirects in list of shortlinks, return dict of resulting URLs""" + links_followed = {} + for shortlink in shortlinks: + url = shortlink + request_result = requests.get(url) + redirect_history = request_result.history + # history might look like: + # (, ) + # where each response object has a URL + all_urls = [] + for redirect in redirect_history: + all_urls.append(redirect.url) + # append the final URL that we finish with + all_urls.append(request_result.url) + links_followed[shortlink] = all_urls + return links_followed + + +if __name__ == "__main__": + shortlinks = ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF'] + print follow_shortlinks(shortlinks) From 1bab751f5208aa0c8f4b309d44ed94c87478a13e Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Thu, 28 Mar 2013 15:10:33 +0000 Subject: [PATCH 27/38] added requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6a99645 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests==1.1.0 From dd4e9322d01985b69e84c51e47ac063570d4170e Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Thu, 4 Apr 2013 21:59:03 +0100 Subject: [PATCH 28/38] adding some , parsing --- README.rst | 1 + ttp/tests.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/README.rst b/README.rst index f7cf047..44da770 100644 --- a/README.rst +++ b/README.rst @@ -97,6 +97,7 @@ Todo ---- * Consider adding capitalised phrase identification + * Consider adding a repeated-char remover (e.g. grrrrrrr->grr) * Make it 1 line to parse and get a results dict via __init__.py * Tag the next release diff --git a/ttp/tests.py b/ttp/tests.py index 4d01e1d..d302537 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -231,11 +231,21 @@ def test_url_followed_comma(self): self.assertEqual(result.html, u'text http://example.com,') self.assertEqual(result.urls, [u'http://example.com']) + def test_url_with_path_preceeded_by_comma(self): + result = self.parser.parse(u'text ,http://example.com/abcde, more') + self.assertEqual(result.html, u'text ,http://example.com/abcde, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + def test_url_with_path_followed_comma(self): result = self.parser.parse(u'text http://example.com/abcde, more') self.assertEqual(result.html, u'text http://example.com/abcde, more') self.assertEqual(result.urls, [u'http://example.com/abcde']) + def test_url_with_path_followed_commas(self): + result = self.parser.parse(u'text http://example.com/abcde,, more') + self.assertEqual(result.html, u'text http://example.com/abcde,, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + def test_url_followed_brace(self): result = self.parser.parse(u'text http://example.com)') self.assertEqual(result.html, u'text http://example.com)') From 4b2d7a02e5fb149f8200ca14597e83f00c62b273 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Thu, 4 Apr 2013 22:00:57 +0100 Subject: [PATCH 29/38] extra note on how to run tests --- README.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 44da770..a8da4cc 100644 --- a/README.rst +++ b/README.rst @@ -69,7 +69,7 @@ Changelog --------- * 2013/2/11 1.0.0.2 released to PyPI - * 2013/4/? 1.0.1 new working version + * 2013/4/? 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack) Tests @@ -77,12 +77,13 @@ Tests Checkout the code via github https://github.com/ianozsvald/twitter-text-python and run tests locally:: - $ python tests.py - ................................................................................................. + $ python ttp/tests.py + .................................................................................................... ---------------------------------------------------------------------- - Ran 97 tests in 0.009s + Ran 100 tests in 0.009s OK + Contributing ------------ From f80d89c5d86873bddda5071346ac1b8848103e80 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:33:40 +0100 Subject: [PATCH 30/38] used autopep8 to clean up the src --- ttp/tests.py | 38 ++++++++++++++++---------------------- ttp/ttp.py | 17 ++++++++--------- 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/ttp/tests.py b/ttp/tests.py index d302537..39aa5ab 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -22,10 +22,10 @@ class TWPTests(unittest.TestCase): + def setUp(self): self.parser = ttp.Parser() - # General Tests ------------------------------------------------------------ # -------------------------------------------------------------------------- def test_urls(self): @@ -45,7 +45,6 @@ def test_all_not_break_url_at(self): self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - # URL tests ---------------------------------------------------------------- # -------------------------------------------------------------------------- def test_url_mid(self): @@ -90,7 +89,8 @@ def test_url_dash(self): def test_url_multiple(self): result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com') - self.assertEqual(result.html, u'http://example.com https://sslexample.com http://sub.example.com') + self.assertEqual( + result.html, u'http://example.com https://sslexample.com http://sub.example.com') self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com']) def test_url_raw_domain(self): @@ -162,7 +162,6 @@ def test_url_long_hypens(self): self.assertEqual(result.html, u'text http://word-and-a-number-8-...') self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/']) - # URL not tests ------------------------------------------------------------ def test_not_url_dotdotdot(self): result = self.parser.parse(u'Is www...foo a valid URL?') @@ -194,7 +193,6 @@ def test_not_url_one_letter_iana(self): self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/') self.assertEqual(result.urls, []) - # URL followed Tests ------------------------------------------------------- def test_url_followed_question(self): result = self.parser.parse(u'text http://example.com?') @@ -271,7 +269,6 @@ def test_url_followed_hypen(self): self.assertEqual(result.html, u'text http://domain.tld-that-you-should-have-put-a-space-after') self.assertEqual(result.urls, [u'http://domain.tld']) - # URL preceeded Tests ------------------------------------------------------- def test_url_preceeded_colon(self): result = self.parser.parse(u'text:http://example.com') @@ -294,7 +291,6 @@ def test_not_url_preceeded_exclamation(self): self.assertEqual(result.html, u'text !http://example.com') self.assertEqual(result.urls, []) - # URL numeric tests -------------------------------------------------------- def test_url_at_numeric(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') @@ -306,7 +302,6 @@ def test_url_at_non_numeric(self): self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar']) - # URL domain tests --------------------------------------------------------- def test_url_WWW(self): result = self.parser.parse(u'WWW.EXAMPLE.COM') @@ -320,7 +315,8 @@ def test_url_www(self): def test_url_only_domain_query_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.') - self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.') + self.assertEqual( + result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it']) def test_url_only_domain_followed_period(self): @@ -354,7 +350,6 @@ def test_not_url_under_domain(self): self.assertEqual(result.html, u'badly formatted http://foo_bar.com') self.assertEqual(result.urls, []) - # Hashtag tests ------------------------------------------------------------ # -------------------------------------------------------------------------- def test_hashtag_followed_full_whitespace(self): @@ -432,7 +427,6 @@ def test_hashtag_under(self): self.assertEqual(result.html, u'text #hash_tag') self.assertEqual(result.tags, [u'hash_tag']) - # Username tests ----------------------------------------------------------- # -------------------------------------------------------------------------- def test_not_username_preceded_letter(self): @@ -515,7 +509,6 @@ def test_username_non_reply(self): self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, None) - # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): @@ -561,7 +554,8 @@ def test_list_not_preceeded_letter(self): def test_list_long_truncate(self): result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') - self.assertEqual(result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') + self.assertEqual( + result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')]) def test_list_with_dash(self): @@ -571,9 +565,10 @@ def test_list_with_dash(self): class TWPTestsWithSpans(unittest.TestCase): + """Test ttp with re spans to extract character co-ords of matches""" def setUp(self): - self.parser = ttp.Parser(include_spans = True) + self.parser = ttp.Parser(include_spans=True) def test_spans_in_tweets(self): """Test some coca-cola tweets taken from twitter with spans""" @@ -606,15 +601,14 @@ def test_edge_cases(self): self.assertEqual(result.urls, [(u'http://some.com', (1, 16))]) - # Test it! if __name__ == '__main__': unittest.main() - #verbosity = 0 # set to 2 for verbose output - #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases) - #unittest.TextTestRunner(verbosity=verbosity).run(suite) - #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) - #unittest.TextTestRunner(verbosity=verbosity).run(suite) - #suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) - #unittest.TextTestRunner(verbosity=verbosity).run(suite) + # verbosity = 0 # set to 2 for verbose output + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) diff --git a/ttp/ttp.py b/ttp/ttp.py index 1202f2c..ac7c79e 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -39,8 +39,8 @@ # Users USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) -REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS \ - + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) +REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS + + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) # Hashtags HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS @@ -62,13 +62,14 @@ URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))' % (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS, PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS), - re.IGNORECASE) + re.IGNORECASE) # Registered IANA one letter domains IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net') class ParseResult(object): + '''A class containing the results of a parsed Tweet. Attributes: @@ -109,9 +110,10 @@ def __init__(self, urls, users, reply, lists, tags, html): class Parser(object): + '''A Tweet Parser''' - def __init__(self, max_url_length=30, include_spans = False): + def __init__(self, max_url_length=30, include_spans=False): self._max_url_length = max_url_length self._include_spans = include_spans @@ -144,7 +146,6 @@ def _html(self, text): html = LIST_REGEX.sub(self._parse_lists, html) return HASHTAG_REGEX.sub(self._parse_tags, html) - # Internal parser stuff ---------------------------------------------------- def _parse_urls(self, match): '''Parse URLs.''' @@ -186,7 +187,7 @@ def _parse_urls(self, match): if self._html: return '%s%s' % (pre, self.format_url(full_url, - self._shorten_url(escape(url)))) + self._shorten_url(escape(url)))) def _parse_users(self, match): '''Parse usernames.''' @@ -261,12 +262,11 @@ def _shorten_url(self, text): else: return text - # User defined formatters -------------------------------------------------- def format_tag(self, tag, text): '''Return formatted HTML for a hashtag.''' return '%s%s' \ - % (urllib.quote('#' + text.encode('utf-8')), tag, text) + % (urllib.quote('#' + text.encode('utf-8')), tag, text) def format_username(self, at_char, user): '''Return formatted HTML for a username.''' @@ -289,4 +289,3 @@ def escape(text): return ''.join({'&': '&', '"': '"', '\'': ''', '>': '>', '<': '<'}.get(c, c) for c in text) - From 93f6985a4cf68d84d83e0ac30f1c550a0f3c318c Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:43:54 +0100 Subject: [PATCH 31/38] minor --- README.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index a8da4cc..c3f76ec 100644 --- a/README.rst +++ b/README.rst @@ -3,8 +3,8 @@ twitter-text-python **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display. -It is based on twitter-text-java_ and passes all the unittests of -twitter-text-conformance_ plus some additional ones. +It is based on twitter-text-java_ and did pass all the unittests of +twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5 ): .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance @@ -69,7 +69,7 @@ Changelog --------- * 2013/2/11 1.0.0.2 released to PyPI - * 2013/4/? 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack) + * 2013/6/1 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack), used autopep8 to clean the src, added a shortlink expander Tests @@ -102,6 +102,12 @@ Todo * Make it 1 line to parse and get a results dict via __init__.py * Tag the next release +Doing a release +--------------- + +In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt + + License ------- From e00cad8e67bc018c93a6f693a96634b26e903f12 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:55:04 +0100 Subject: [PATCH 32/38] notes on pypi release and git tagging --- README.rst | 7 ++++++- setup.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index c3f76ec..15bd337 100644 --- a/README.rst +++ b/README.rst @@ -105,7 +105,12 @@ Todo Doing a release --------------- -In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt +In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form: + + $ # edit setup.py to bump the version number + $ git tag -a v1.0.1 -m 'v1.0.1 release' + $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi + $ # this uses ~/.pypirc with cached login details License diff --git a/setup.py b/setup.py index 60070b1..2a9dd84 100644 --- a/setup.py +++ b/setup.py @@ -2,9 +2,9 @@ setup( name='twitter-text-python', - version='1.0.0.2', + version='1.0.1', description='Twitter Tweet parser and formatter', - long_description="Extract @users, #hashtags and URLs from tweets including entity locations, also generate HTML for output. Visit the github site for full instructions.", + long_description="Extract @users, #hashtags and URLs (and unwind shortened links) from tweets including entity locations, also generate HTML for output. Visit https://github.com/ianozsvald/twitter-text-python for examples.", #open('README.rst').read(), author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)', author_email='ian@ianozsvald.com', From 07240991f5a4a54c671ea386ba0961ff56ab936b Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:56:05 +0100 Subject: [PATCH 33/38] note on pushing tags --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 15bd337..1658b54 100644 --- a/README.rst +++ b/README.rst @@ -109,6 +109,7 @@ In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The shor $ # edit setup.py to bump the version number $ git tag -a v1.0.1 -m 'v1.0.1 release' + $ git push origin --tags $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi $ # this uses ~/.pypirc with cached login details From 033a5abd173cde36378d5be4f1a2165573381a58 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:57:12 +0100 Subject: [PATCH 34/38] cleanup --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 1658b54..6d3f315 100644 --- a/README.rst +++ b/README.rst @@ -107,11 +107,11 @@ Doing a release In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form: - $ # edit setup.py to bump the version number - $ git tag -a v1.0.1 -m 'v1.0.1 release' - $ git push origin --tags - $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi - $ # this uses ~/.pypirc with cached login details + $ # edit setup.py to bump the version number + $ git tag -a v1.0.1 -m 'v1.0.1 release' + $ git push origin --tags + $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi + $ # this uses ~/.pypirc with cached login details License From 66c209bdd53b6c27a9f509f03fbd952cecc69cce Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:58:10 +0100 Subject: [PATCH 35/38] cleanup --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 6d3f315..8935b66 100644 --- a/README.rst +++ b/README.rst @@ -107,7 +107,7 @@ Doing a release In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form: - $ # edit setup.py to bump the version number + $ # edit setup.py to bump the version number (ignore) $ git tag -a v1.0.1 -m 'v1.0.1 release' $ git push origin --tags $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi From aa6bf1acd0b1fa144bf212b184bfb6c947c9fc8f Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Sat, 1 Jun 2013 13:58:50 +0100 Subject: [PATCH 36/38] cleanup --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 8935b66..17b0189 100644 --- a/README.rst +++ b/README.rst @@ -105,9 +105,9 @@ Todo Doing a release --------------- -In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form: +In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:: - $ # edit setup.py to bump the version number (ignore) + $ # edit setup.py to bump the version number $ git tag -a v1.0.1 -m 'v1.0.1 release' $ git push origin --tags $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi From 756f947a4322f86337180ba9f498140f74d47f35 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 28 Jul 2014 22:43:40 +0100 Subject: [PATCH 37/38] point to Ed for his support --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index 17b0189..607221c 100644 --- a/README.rst +++ b/README.rst @@ -3,6 +3,10 @@ twitter-text-python **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display. +---- +**UPDATE** this project is _now maintained by Ed Burnett_. Please go here for the active version: https://github.com/edburnett/twitter-text-python +---- + It is based on twitter-text-java_ and did pass all the unittests of twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5 ): @@ -56,6 +60,8 @@ To use the shortlink follower: Installation ------------ +**NOTE** this version (Ian's) is no longer maintained, see Ed's active version instead: https://github.com/edburnett/twitter-text-python + pip and easy_install will do the job:: # via: http://pypi.python.org/pypi/twitter-text-python From 13f4990cd5e1c8b6b424ac867fb7d72a8e0aa330 Mon Sep 17 00:00:00 2001 From: Ian Ozsvald Date: Mon, 28 Jul 2014 22:44:41 +0100 Subject: [PATCH 38/38] point to Ed for his support --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 607221c..2ed8cf0 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ twitter-text-python **twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display. ---- -**UPDATE** this project is _now maintained by Ed Burnett_. Please go here for the active version: https://github.com/edburnett/twitter-text-python +**UPDATE** this project is now maintained by Ed Burnett, please go here for the active version: https://github.com/edburnett/twitter-text-python ---- It is based on twitter-text-java_ and did pass all the unittests of