diff --git a/README.rst b/README.rst index 207d3a9..2ed8cf0 100644 --- a/README.rst +++ b/README.rst @@ -1,34 +1,94 @@ twitter-text-python =================== -**twitter-text-python** is a Tweet parser and formatter for Python. +**twitter-text-python** is a Tweet parser and formatter for Python. Extract users, hashtags, URLs and format as HTML for display. -It is based on twitter-text-java_ and passes all the unittests of -twitter-text-conformance_ plus some additional ones. +---- +**UPDATE** this project is now maintained by Ed Burnett, please go here for the active version: https://github.com/edburnett/twitter-text-python +---- + +It is based on twitter-text-java_ and did pass all the unittests of +twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5 ): .. _twitter-text-java: http://github.com/mzsanford/twitter-text-java .. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance +This version was forked by Ian Ozsvald in January 2013 and released to PyPI, some bugs were fixed, a few minor changes to functionality added: +https://github.com/ianozsvald/twitter-text-python + +PyPI release: +http://pypi.python.org/pypi/twitter-text-python/ + +The original ttp comes from Ivo Wetzel (Ivo's version no longer supported): +https://github.com/BonsaiDen/twitter-text-python + Usage:: - >>> import ttp + >>> from ttp import ttp >>> p = ttp.Parser() - >>> result = p.parse("@BonsaiDen Hey that's a great Tweet parser! #twp") + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") >>> result.reply - 'BonsaiDen' + 'ianozsvald' >>> result.users - ['BonsaiDen'] + ['ianozsvald'] >>> result.tags - ['twp'] + ['IvoWertzel'] >>> result.urls - [] + ['https://github.com/ianozsvald/'] >>> result.html - u'@BonsaiDen Hey that\'s a great Tweet Parser! - #twp' - + u'@ianozsvald, you now support #IvoWertzel\'s tweet parser! https://github.com/ianozsvald/' If you need different HTML output just subclass and override the ``format_*`` methods. +You can also ask for the span tags to be returned for each entity:: + + >>> p = ttp.Parser(include_spans=True) + >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/") + >>> result.urls + [('https://github.com/ianozsvald/', (57, 87))] + + +To use the shortlink follower: + + >>> from ttp import utils + >>> # assume that result.urls == ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF'] + >>> print utils.follow_shortlinks(result.urls) # pass in list of shortlink URLs + {'http://t.co/8o0z9BbEMu': [u'http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562'], u'http://bbc.in/16dClPF': [u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562']} + >>> # note that bad shortlink URLs have a key to an empty list (lost/forgotten shortlink URLs don't generate any error) + + +Installation +------------ + +**NOTE** this version (Ian's) is no longer maintained, see Ed's active version instead: https://github.com/edburnett/twitter-text-python + +pip and easy_install will do the job:: + + # via: http://pypi.python.org/pypi/twitter-text-python + $ pip install twitter-text-python + $ python + >>> from ttp import ttp + >>> ttp.__version__ + '1.0.0.2' + +Changelog +--------- + + * 2013/2/11 1.0.0.2 released to PyPI + * 2013/6/1 1.0.1 new working version, adding comma parse fix (thanks https://github.com/muckrack), used autopep8 to clean the src, added a shortlink expander + + +Tests +----- + +Checkout the code via github https://github.com/ianozsvald/twitter-text-python and run tests locally:: + + $ python ttp/tests.py + .................................................................................................... + ---------------------------------------------------------------------- + Ran 100 tests in 0.009s + OK + Contributing ------------ @@ -37,23 +97,53 @@ The source is available on GitHub_, to contribute to the project, fork it on GitHub and send a pull request. Everyone is welcome to make improvements to **twp**! -.. _GitHub: http://github.com/BonsaiDen/twitter-text-python +.. _GitHub: https://github.com/ianozsvald/twitter-text-python + + +Todo +---- + + * Consider adding capitalised phrase identification + * Consider adding a repeated-char remover (e.g. grrrrrrr->grr) + * Make it 1 line to parse and get a results dict via __init__.py + * Tag the next release + +Doing a release +--------------- + +In parent directory on Ian's machine see USE_THIS_FOR_PYPI_RELEASE.txt. The short form:: + + $ # edit setup.py to bump the version number + $ git tag -a v1.0.1 -m 'v1.0.1 release' + $ git push origin --tags + $ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi + $ # this uses ~/.pypirc with cached login details + License -======= +------- + +*MIT* + +Copyright (c) 2012 Ivo Wetzel. -Copyright (c) 2010 Ivo Wetzel +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: -**twitter-text-python** is free software: you can redistribute it and/or -modify it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. -**twitter-text-python** is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. -You should have received a copy of the GNU General Public License along with -**twitter-text-python**. If not, see . +Copyright (c) 2010-2013 Ivo Wetzel diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6a99645 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests==1.1.0 diff --git a/setup.py b/setup.py index 9de7d83..2a9dd84 100644 --- a/setup.py +++ b/setup.py @@ -2,24 +2,25 @@ setup( name='twitter-text-python', - version='1.0', - description='Tweet parser and formatter', - long_description=open('README.rst').read(), - author='Ivo Wetzel', - author_email='', - url='http://github.com/BonsaiDen/twitter-text-python', - license='GPL', - py_modules=['ttp'], + version='1.0.1', + description='Twitter Tweet parser and formatter', + long_description="Extract @users, #hashtags and URLs (and unwind shortened links) from tweets including entity locations, also generate HTML for output. Visit https://github.com/ianozsvald/twitter-text-python for examples.", + #open('README.rst').read(), + author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)', + author_email='ian@ianozsvald.com', + url='https://github.com/ianozsvald/twitter-text-python', + license='MIT', + packages=['ttp'], include_package_data=True, zip_safe=False, install_requires=[], classifiers=[ - 'Environment :: Web Environment', - # I don't know what exactly this means, but why not? + 'Environment :: Console', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text Processing :: Linguistic', ] ) diff --git a/ttp/__init__.py b/ttp/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests.py b/ttp/tests.py similarity index 85% rename from tests.py rename to ttp/tests.py index e084abc..39aa5ab 100644 --- a/tests.py +++ b/ttp/tests.py @@ -22,534 +22,593 @@ class TWPTests(unittest.TestCase): + def setUp(self): self.parser = ttp.Parser() - - + # General Tests ------------------------------------------------------------ # -------------------------------------------------------------------------- + def test_urls(self): + """Confirm that # in a URL works along with ,""" + result = self.parser.parse(u'big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag') + self.assertEqual(result.urls, [u'http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2']) + self.assertEqual(result.tags, [u'ahashtag']) + def test_all_not_allow_amp_without_question(self): result = self.parser.parse(u'Check out: http://www.github.com/test&@username') self.assertEqual(result.html, u'Check out: http://www.github.com/test&@username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.urls, [u'http://www.github.com/test']) - + def test_all_not_break_url_at(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - - + # URL tests ---------------------------------------------------------------- # -------------------------------------------------------------------------- def test_url_mid(self): result = self.parser.parse(u'text http://example.com more text') self.assertEqual(result.html, u'text http://example.com more text') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_unicode(self): result = self.parser.parse(u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp') self.assertEqual(result.html, u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp') self.assertEqual(result.urls, [u'http://\u272adf.ws/ejp']) - + def test_url_parentheses(self): result = self.parser.parse(u'text (http://example.com)') self.assertEqual(result.html, u'text (http://example.com)') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_underscore(self): result = self.parser.parse(u'text http://example.com/test/foo_123.jpg') self.assertEqual(result.html, u'text http://example.com/test/foo...') self.assertEqual(result.urls, [u'http://example.com/test/foo_123.jpg']) - + def test_url_underscore_dot(self): result = self.parser.parse(u'text http://example.com/test/bla.net_foo_123.jpg') self.assertEqual(result.html, u'text http://example.com/test/bla...') self.assertEqual(result.urls, [u'http://example.com/test/bla.net_foo_123.jpg']) - + def test_url_amp_lang_equals(self): result = self.parser.parse(u'Check out http://search.twitter.com/search?q=avro&lang=en') self.assertEqual(result.html, u'Check out http://search.twitter.com/s...') self.assertEqual(result.urls, [u'http://search.twitter.com/search?q=avro&lang=en']) - + def test_url_amp_break(self): result = self.parser.parse(u'Check out http://twitter.com/te?foo&invalid=True') self.assertEqual(result.html, u'Check out http://twitter.com/te?foo...') self.assertEqual(result.urls, [u'http://twitter.com/te?foo&invalid=True']) - + def test_url_dash(self): result = self.parser.parse(u'Is www.foo-bar.com a valid URL?') self.assertEqual(result.html, u'Is www.foo-bar.com a valid URL?') self.assertEqual(result.urls, [u'www.foo-bar.com']) - + def test_url_multiple(self): result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com') - self.assertEqual(result.html, u'http://example.com https://sslexample.com http://sub.example.com') + self.assertEqual( + result.html, u'http://example.com https://sslexample.com http://sub.example.com') self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com']) - + def test_url_raw_domain(self): result = self.parser.parse(u'See http://example.com example.com') self.assertEqual(result.html, u'See http://example.com example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_embed_link(self): result = self.parser.parse(u'http://example.com') self.assertEqual(result.html, u'http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_trailing(self): result = self.parser.parse(u'text http://example.com') self.assertEqual(result.html, u'text http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_japanese(self): result = self.parser.parse(u'いまなにしてるhttp://example.comいまなにしてる') self.assertEqual(result.html, u'いまなにしてるhttp://example.comいまなにしてる') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_lots_of_punctuation(self): result = self.parser.parse(u'text http://xo.com/~matthew+%-,.;x') self.assertEqual(result.html, u'text http://xo.com/~matthew+%-,.;x') self.assertEqual(result.urls, [u'http://xo.com/~matthew+%-,.;x']) - + def test_url_question_numbers(self): result = self.parser.parse(u'text http://example.com/?77e8fd') self.assertEqual(result.html, u'text http://example.com/?77e8fd') self.assertEqual(result.urls, [u'http://example.com/?77e8fd']) - + def test_url_one_letter_other(self): result = self.parser.parse(u'text http://u.nu/') self.assertEqual(result.html, u'text http://u.nu/') self.assertEqual(result.urls, [u'http://u.nu/']) - + result = self.parser.parse(u'text http://u.tv/') self.assertEqual(result.html, u'text http://u.tv/') self.assertEqual(result.urls, [u'http://u.tv/']) - + def test_url_one_letter_iana(self): result = self.parser.parse(u'text http://x.com/') self.assertEqual(result.html, u'text http://x.com/') self.assertEqual(result.urls, [u'http://x.com/']) - + result = self.parser.parse(u'text http://Q.com/') self.assertEqual(result.html, u'text http://Q.com/') self.assertEqual(result.urls, [u'http://Q.com/']) - + result = self.parser.parse(u'text http://z.com/') self.assertEqual(result.html, u'text http://z.com/') self.assertEqual(result.urls, [u'http://z.com/']) - + result = self.parser.parse(u'text http://i.net/') self.assertEqual(result.html, u'text http://i.net/') self.assertEqual(result.urls, [u'http://i.net/']) - + result = self.parser.parse(u'text http://q.net/') self.assertEqual(result.html, u'text http://q.net/') self.assertEqual(result.urls, [u'http://q.net/']) - + result = self.parser.parse(u'text http://X.org/') self.assertEqual(result.html, u'text http://X.org/') self.assertEqual(result.urls, [u'http://X.org/']) - + def test_url_long_hypens(self): result = self.parser.parse(u'text http://word-and-a-number-8-ftw.domain.tld/') self.assertEqual(result.html, u'text http://word-and-a-number-8-...') self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/']) - - + # URL not tests ------------------------------------------------------------ def test_not_url_dotdotdot(self): result = self.parser.parse(u'Is www...foo a valid URL?') self.assertEqual(result.html, u'Is www...foo a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_dash(self): result = self.parser.parse(u'Is www.-foo.com a valid URL?') self.assertEqual(result.html, u'Is www.-foo.com a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_no_tld(self): result = self.parser.parse(u'Is http://no-tld a valid URL?') self.assertEqual(result.html, u'Is http://no-tld a valid URL?') self.assertEqual(result.urls, []) - + def test_not_url_tld_too_short(self): result = self.parser.parse(u'Is http://tld-too-short.x a valid URL?') self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?') self.assertEqual(result.urls, []) - - def test_all_not_break_url_at(self): + + def test_all_not_break_url_at2(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - + def test_not_url_one_letter_iana(self): result = self.parser.parse(u'text http://a.com/ http://a.net/ http://a.org/') self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/') self.assertEqual(result.urls, []) - - + # URL followed Tests ------------------------------------------------------- def test_url_followed_question(self): result = self.parser.parse(u'text http://example.com?') self.assertEqual(result.html, u'text http://example.com?') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_colon(self): result = self.parser.parse(u'text http://example.com:') self.assertEqual(result.html, u'text http://example.com:') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_curly_brace(self): result = self.parser.parse(u'text http://example.com}') self.assertEqual(result.html, u'text http://example.com}') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_single_quote(self): result = self.parser.parse(u'text http://example.com') self.assertEqual(result.html, u'text http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_dot(self): result = self.parser.parse(u'text http://example.com.') self.assertEqual(result.html, u'text http://example.com.') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_exclamation(self): result = self.parser.parse(u'text http://example.com!') self.assertEqual(result.html, u'text http://example.com!') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_comma(self): result = self.parser.parse(u'text http://example.com,') self.assertEqual(result.html, u'text http://example.com,') self.assertEqual(result.urls, [u'http://example.com']) - + + def test_url_with_path_preceeded_by_comma(self): + result = self.parser.parse(u'text ,http://example.com/abcde, more') + self.assertEqual(result.html, u'text ,http://example.com/abcde, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + + def test_url_with_path_followed_comma(self): + result = self.parser.parse(u'text http://example.com/abcde, more') + self.assertEqual(result.html, u'text http://example.com/abcde, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + + def test_url_with_path_followed_commas(self): + result = self.parser.parse(u'text http://example.com/abcde,, more') + self.assertEqual(result.html, u'text http://example.com/abcde,, more') + self.assertEqual(result.urls, [u'http://example.com/abcde']) + def test_url_followed_brace(self): result = self.parser.parse(u'text http://example.com)') self.assertEqual(result.html, u'text http://example.com)') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_big_brace(self): result = self.parser.parse(u'text http://example.com]') self.assertEqual(result.html, u'text http://example.com]') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_equals(self): result = self.parser.parse(u'text http://example.com=') self.assertEqual(result.html, u'text http://example.com=') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_semicolon(self): result = self.parser.parse(u'text http://example.com;') self.assertEqual(result.html, u'text http://example.com;') self.assertEqual(result.urls, [u'http://example.com']) - + def test_url_followed_hypen(self): result = self.parser.parse(u'text http://domain.tld-that-you-should-have-put-a-space-after') self.assertEqual(result.html, u'text http://domain.tld-that-you-should-have-put-a-space-after') self.assertEqual(result.urls, [u'http://domain.tld']) - - + # URL preceeded Tests ------------------------------------------------------- def test_url_preceeded_colon(self): result = self.parser.parse(u'text:http://example.com') self.assertEqual(result.html, u'text:http://example.com') self.assertEqual(result.urls, [u'http://example.com']) - + def test_not_url_preceeded_equals(self): result = self.parser.parse(u'text =http://example.com') self.assertEqual(result.html, u'text =http://example.com') self.assertEqual(result.urls, []) - + # NOT def test_not_url_preceeded_forwardslash(self): result = self.parser.parse(u'text /http://example.com') self.assertEqual(result.html, u'text /http://example.com') self.assertEqual(result.urls, []) - + def test_not_url_preceeded_exclamation(self): result = self.parser.parse(u'text !http://example.com') self.assertEqual(result.html, u'text !http://example.com') self.assertEqual(result.urls, []) - - + # URL numeric tests -------------------------------------------------------- def test_url_at_numeric(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406']) - + def test_url_at_non_numeric(self): result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/foobar') self.assertEqual(result.html, u'http://www.flickr.com/photo...') self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar']) - - + # URL domain tests --------------------------------------------------------- def test_url_WWW(self): result = self.parser.parse(u'WWW.EXAMPLE.COM') self.assertEqual(result.html, u'WWW.EXAMPLE.COM') self.assertEqual(result.urls, [u'WWW.EXAMPLE.COM']) - + def test_url_www(self): result = self.parser.parse(u'www.example.com') self.assertEqual(result.html, u'www.example.com') self.assertEqual(result.urls, [u'www.example.com']) - + def test_url_only_domain_query_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.') - self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.') + self.assertEqual( + result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it']) - + def test_url_only_domain_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.') self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me']) - + def test_url_only_domain_path_followed_period(self): result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.') self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.') self.assertEqual(result.urls, [u'http://tell.me/why']) - + def test_url_long_tld(self): result = self.parser.parse(u'http://example.mobi/path') self.assertEqual(result.html, u'http://example.mobi/path') self.assertEqual(result.urls, [u'http://example.mobi/path']) - + def test_url_multiple_protocols(self): result = self.parser.parse(u'http://foo.com AND https://bar.com AND www.foobar.com') self.assertEqual(result.html, u'http://foo.com AND https://bar.com AND www.foobar.com') self.assertEqual(result.urls, [u'http://foo.com', u'https://bar.com', u'www.foobar.com']) - + # NOT def test_not_url_exclamation_domain(self): result = self.parser.parse(u'badly formatted http://foo!bar.com') self.assertEqual(result.html, u'badly formatted http://foo!bar.com') self.assertEqual(result.urls, []) - + def test_not_url_under_domain(self): result = self.parser.parse(u'badly formatted http://foo_bar.com') self.assertEqual(result.html, u'badly formatted http://foo_bar.com') self.assertEqual(result.urls, []) - - + # Hashtag tests ------------------------------------------------------------ # -------------------------------------------------------------------------- def test_hashtag_followed_full_whitespace(self): result = self.parser.parse(u'#hashtag text') self.assertEqual(result.html, u'#hashtag text') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_followed_full_hash(self): result = self.parser.parse(u'#hashtag') self.assertEqual(result.html, u'#hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_preceeded_full_whitespace(self): result = self.parser.parse(u'text #hashtag') self.assertEqual(result.html, u'text #hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_number(self): result = self.parser.parse(u'text #1tag') self.assertEqual(result.html, u'text #1tag') self.assertEqual(result.tags, [u'1tag']) - + def test_not_hashtag_escape(self): result = self.parser.parse(u'&#nbsp;') self.assertEqual(result.html, u'&#nbsp;') self.assertEqual(result.tags, []) - + def test_hashtag_japanese(self): result = self.parser.parse(u'text #hashtagの') self.assertEqual(result.html, u'text #hashtagの') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_period(self): result = self.parser.parse(u'text.#hashtag') self.assertEqual(result.html, u'text.#hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_trailing(self): result = self.parser.parse(u'text #hashtag') self.assertEqual(result.html, u'text #hashtag') self.assertEqual(result.tags, [u'hashtag']) - + def test_not_hashtag_exclamation(self): result = self.parser.parse(u'text #hashtag!') self.assertEqual(result.html, u'text #hashtag!') self.assertEqual(result.tags, [u'hashtag']) - + def test_hashtag_multiple(self): result = self.parser.parse(u'text #hashtag1 #hashtag2') self.assertEqual(result.html, u'text #hashtag1 #hashtag2') self.assertEqual(result.tags, [u'hashtag1', u'hashtag2']) - + def test_not_hashtag_number(self): result = self.parser.parse(u'text #1234') self.assertEqual(result.html, u'text #1234') self.assertEqual(result.tags, []) - + def test_not_hashtag_text(self): result = self.parser.parse(u'text#hashtag') self.assertEqual(result.html, u'text#hashtag') self.assertEqual(result.tags, []) - + def test_hashtag_umlaut(self): result = self.parser.parse(u'text #hash_tagüäö') self.assertEqual(result.html, u'text #hash_tagüäö') self.assertEqual(result.tags, [u'hash_tag\xfc\xe4\xf6']) - + def test_hashtag_alpha(self): result = self.parser.parse(u'text #hash0tag') self.assertEqual(result.html, u'text #hash0tag') self.assertEqual(result.tags, [u'hash0tag']) - + def test_hashtag_under(self): result = self.parser.parse(u'text #hash_tag') self.assertEqual(result.html, u'text #hash_tag') self.assertEqual(result.tags, [u'hash_tag']) - - + # Username tests ----------------------------------------------------------- # -------------------------------------------------------------------------- def test_not_username_preceded_letter(self): result = self.parser.parse(u'meet@the beach') self.assertEqual(result.html, u'meet@the beach') self.assertEqual(result.users, []) - + def test_username_preceded_punctuation(self): result = self.parser.parse(u'.@username') self.assertEqual(result.html, u'.@username') self.assertEqual(result.users, [u'username']) - + def test_username_preceded_japanese(self): result = self.parser.parse(u'あ@username') self.assertEqual(result.html, u'あ@username') self.assertEqual(result.users, [u'username']) - + def test_username_followed_japanese(self): result = self.parser.parse(u'@usernameの') self.assertEqual(result.html, u'@usernameの') self.assertEqual(result.users, [u'username']) - + def test_username_surrounded_japanese(self): result = self.parser.parse(u'あ@usernameの') self.assertEqual(result.html, u'あ@usernameの') self.assertEqual(result.users, [u'username']) - + def test_username_followed_punctuation(self): result = self.parser.parse(u'@username&^$%^') self.assertEqual(result.html, u'@username&^$%^') self.assertEqual(result.users, [u'username']) - + def test_not_username_spaced(self): result = self.parser.parse(u'@ username') self.assertEqual(result.html, u'@ username') self.assertEqual(result.users, []) - + def test_username_beginning(self): result = self.parser.parse(u'@username text') self.assertEqual(result.html, u'@username text') self.assertEqual(result.users, [u'username']) - + def test_username_to_long(self): result = self.parser.parse(u'@username9012345678901') self.assertEqual(result.html, u'@username9012345678901') self.assertEqual(result.users, [u'username901234567890']) - + def test_username_full_at_sign(self): result = self.parser.parse(u'@username') self.assertEqual(result.html, u'@username') self.assertEqual(result.users, [u'username']) - + def test_username_trailing(self): result = self.parser.parse(u'text @username') self.assertEqual(result.html, u'text @username') self.assertEqual(result.users, [u'username']) - + # Replies def test_username_reply_simple(self): result = self.parser.parse(u'@username') self.assertEqual(result.html, u'@username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_reply_whitespace(self): result = self.parser.parse(u' @username') self.assertEqual(result.html, u' @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_reply_full(self): result = self.parser.parse(u' @username') self.assertEqual(result.html, u' @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, u'username') - + def test_username_non_reply(self): result = self.parser.parse(u'test @username') self.assertEqual(result.html, u'test @username') self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, None) - - + # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): result = self.parser.parse(u'text @username/list') self.assertEqual(result.html, u'text @username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_beginning(self): result = self.parser.parse(u'@username/list') self.assertEqual(result.html, u'@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_preceeded_punctuation(self): result = self.parser.parse(u'.@username/list') self.assertEqual(result.html, u'.@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_followed_punctuation(self): result = self.parser.parse(u'@username/list&^$%^') self.assertEqual(result.html, u'@username/list&^$%^') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_not_slash_space(self): result = self.parser.parse(u'@username/ list') self.assertEqual(result.html, u'@username/ list') self.assertEqual(result.users, [u'username']) self.assertEqual(result.lists, []) - - def test_list_beginning(self): + + def test_list_beginning2(self): result = self.parser.parse(u'@username/list') self.assertEqual(result.html, u'@username/list') self.assertEqual(result.lists, [(u'username', u'list')]) - + def test_list_not_empty_username(self): result = self.parser.parse(u'text @/list') self.assertEqual(result.html, u'text @/list') self.assertEqual(result.lists, []) - + def test_list_not_preceeded_letter(self): result = self.parser.parse(u'meet@the/beach') self.assertEqual(result.html, u'meet@the/beach') self.assertEqual(result.lists, []) - + def test_list_long_truncate(self): result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') - self.assertEqual(result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') + self.assertEqual( + result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A') self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')]) - + def test_list_with_dash(self): result = self.parser.parse(u'text @username/list-foo') self.assertEqual(result.html, u'text @username/list-foo') self.assertEqual(result.lists, [(u'username', u'list-foo')]) +class TWPTestsWithSpans(unittest.TestCase): + + """Test ttp with re spans to extract character co-ords of matches""" + def setUp(self): + self.parser = ttp.Parser(include_spans=True) + + def test_spans_in_tweets(self): + """Test some coca-cola tweets taken from twitter with spans""" + result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7') + self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (41, 61))]) + + result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False) + self.assertEqual(result.urls, []) + self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (1, 34))]) + self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))]) + + result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False) + self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (95, 115))]) + self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))]) + self.assertEqual(result.tags, [(u'GameOn', (76, 83)), (u'ad', (84, 87))]) + + def test_users_in_tweets(self): + result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False) + self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))]) + + def test_edge_cases(self): + """Some edge cases that upset the original version of ttp""" + result = self.parser.parse(u' @user', html=False) + self.assertEqual(result.users, [(u'user', (1, 6))]) + + result = self.parser.parse(u' #hash ', html=False) + self.assertEqual(result.tags, [(u'hash', (1, 6))]) + + result = self.parser.parse(u' http://some.com ', html=False) + self.assertEqual(result.urls, [(u'http://some.com', (1, 16))]) + + # Test it! if __name__ == '__main__': unittest.main() + # verbosity = 0 # set to 2 for verbose output + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) + # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests) + # unittest.TextTestRunner(verbosity=verbosity).run(suite) diff --git a/ttp.py b/ttp/ttp.py similarity index 82% rename from ttp.py rename to ttp/ttp.py index 27102a9..ac7c79e 100644 --- a/ttp.py +++ b/ttp/ttp.py @@ -13,14 +13,18 @@ # You should have received a copy of the GNU General Public License along with # twitter-text-python. If not, see . -# TODO create a setup.py - +# Forked by Ian Ozsvald: +# https://github.com/ianozsvald/twitter-text-python +# from: +# https://github.com/BonsaiDen/twitter-text-python # Tweet Parser and Formatter --------------------------------------------------- # ------------------------------------------------------------------------------ import re import urllib +__version__ = "1.0.1.0" + # Some of this code has been translated from the twitter-text-java library: # AT_SIGNS = ur'[@\uff20]' @@ -35,8 +39,8 @@ # Users USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) -REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS \ - + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) +REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS + + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) # Hashtags HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS @@ -55,76 +59,78 @@ PATH_ENDING_CHARS = r'[%s\)=#/]' % UTF_CHARS QUERY_ENDING_CHARS = '[a-z0-9_&=#]' -URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/%s*%s?)?(\?%s*%s)?))' +URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))' % (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS, PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS), - re.IGNORECASE) - + re.IGNORECASE) # Registered IANA one letter domains IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net') class ParseResult(object): + '''A class containing the results of a parsed Tweet. - + Attributes: - urls: A list containing all the valid urls in the Tweet. - + - users A list containing all the valid usernames in the Tweet. - + - reply A string containing the username this tweet was a reply to. This only matches a username at the beginning of the Tweet, it may however be preceeded by whitespace. Note: It's generally better to rely on the Tweet JSON/XML in order to find out if it's a reply or not. - + - lists A list containing all the valid lists in the Tweet. Each list item is a tuple in the format (username, listname). - + - tags A list containing all the valid tags in theTweet. - + - html A string containg formatted HTML. To change the formatting sublcass twp.Parser and override the format_* methods. - + ''' - + def __init__(self, urls, users, reply, lists, tags, html): - self.urls = list(set(urls)) if urls else [] #fixes dups - self.users = list(set(users)) if users else [] - self.lists = list(set(lists)) if lists else [] - self.reply = list(set(reply)) if reply else [] - self.tags = list(set(tags)) if tags else [] + self.urls = urls if urls else [] + self.users = users if users else [] + self.lists = lists if lists else [] + self.reply = reply if reply else None + self.tags = tags if tags else [] self.html = html class Parser(object): + '''A Tweet Parser''' - - def __init__(self, max_url_length=30): + + def __init__(self, max_url_length=30, include_spans=False): self._max_url_length = max_url_length - + self._include_spans = include_spans + def parse(self, text, html=True): '''Parse the text and return a ParseResult instance.''' self._urls = [] self._users = [] self._lists = [] self._tags = [] - + reply = REPLY_REGEX.match(text) reply = reply.groups(0)[0] if reply is not None else None - + parsed_html = self._html(text) if html else self._text(text) return ParseResult(self._urls, self._users, reply, self._lists, self._tags, parsed_html) - + def _text(self, text): '''Parse a Tweet without generating HTML.''' URL_REGEX.sub(self._parse_urls, text) @@ -132,84 +138,95 @@ def _text(self, text): LIST_REGEX.sub(self._parse_lists, text) HASHTAG_REGEX.sub(self._parse_tags, text) return None - + def _html(self, text): '''Parse a Tweet and generate HTML.''' html = URL_REGEX.sub(self._parse_urls, text) html = USERNAME_REGEX.sub(self._parse_users, html) html = LIST_REGEX.sub(self._parse_lists, html) return HASHTAG_REGEX.sub(self._parse_tags, html) - - + # Internal parser stuff ---------------------------------------------------- def _parse_urls(self, match): '''Parse URLs.''' - + mat = match.group(0) - + # Fix a bug in the regex concerning www...com and www.-foo.com domains # TODO fix this in the regex instead of working around it here domain = match.group(5) if domain[0] in '.-': return mat - + # Only allow IANA one letter domains that are actually registered if len(domain) == 5 \ and domain[-4:].lower() in ('.com', '.org', '.net') \ and not domain.lower() in IANA_ONE_LETTER_DOMAINS: - + return mat - + # Check for urls without http(s) pos = mat.find('http') if pos != -1: pre, url = mat[:pos], mat[pos:] full_url = url - + # Find the www and force http:// else: pos = mat.lower().find('www') pre, url = mat[:pos], mat[pos:] full_url = 'http://%s' % url - - self._urls.append(url) - + + if self._include_spans: + span = match.span(0) + # add an offset if pre is e.g. ' ' + span = (span[0] + len(pre), span[1]) + self._urls.append((url, span)) + else: + self._urls.append(url) + if self._html: return '%s%s' % (pre, self.format_url(full_url, - self._shorten_url(escape(url)))) - + self._shorten_url(escape(url)))) + def _parse_users(self, match): '''Parse usernames.''' - + # Don't parse lists here if match.group(2) is not None: return match.group(0) - + mat = match.group(0) - self._users.append(mat[1:]) - + if self._include_spans: + self._users.append((mat[1:], match.span(0))) + else: + self._users.append(mat[1:]) + if self._html: return self.format_username(mat[0:1], mat[1:]) - + def _parse_lists(self, match): '''Parse lists.''' - + # Don't parse usernames here if match.group(4) is None: return match.group(0) - + pre, at_char, user, list_name = match.groups() list_name = list_name[1:] - self._lists.append((user, list_name)) - + if self._include_spans: + self._lists.append((user, list_name, match.span(0))) + else: + self._lists.append((user, list_name)) + if self._html: return '%s%s' % (pre, self.format_list(at_char, user, list_name)) - + def _parse_tags(self, match): '''Parse hashtags.''' - + mat = match.group(0) - + # Fix problems with the regex capturing stuff infront of the # tag = None for i in u'#\uff03': @@ -217,45 +234,50 @@ def _parse_tags(self, match): if pos != -1: tag = i break - + pre, text = mat[:pos], mat[pos + 1:] - self._tags.append(text) - + if self._include_spans: + span = match.span(0) + # add an offset if pre is e.g. ' ' + span = (span[0] + len(pre), span[1]) + self._tags.append((text, span)) + else: + self._tags.append(text) + if self._html: return '%s%s' % (pre, self.format_tag(tag, text)) - + def _shorten_url(self, text): '''Shorten a URL and make sure to not cut of html entities.''' - + if len(text) > self._max_url_length and self._max_url_length != -1: text = text[0:self._max_url_length - 3] amp = text.rfind('&') close = text.rfind(';') if amp != -1 and (close == -1 or close < amp): text = text[0:amp] - + return text + '...' - + else: return text - - + # User defined formatters -------------------------------------------------- def format_tag(self, tag, text): '''Return formatted HTML for a hashtag.''' return '%s%s' \ - % (urllib.quote('#' + text.encode('utf-8')), tag, text) - + % (urllib.quote('#' + text.encode('utf-8')), tag, text) + def format_username(self, at_char, user): '''Return formatted HTML for a username.''' return '%s%s' \ % (user, at_char, user) - + def format_list(self, at_char, user, list_name): '''Return formatted HTML for a list.''' return '%s%s/%s' \ % (user, list_name, at_char, user, list_name) - + def format_url(self, url, text): '''Return formatted HTML for a url.''' return '%s' % (escape(url), text) @@ -267,4 +289,3 @@ def escape(text): return ''.join({'&': '&', '"': '"', '\'': ''', '>': '>', '<': '<'}.get(c, c) for c in text) - diff --git a/ttp/utils.py b/ttp/utils.py new file mode 100644 index 0000000..2c3d822 --- /dev/null +++ b/ttp/utils.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Unwind short-links e.g. bit.ly, t.co etc to their canonical links""" +import requests + + +def follow_shortlinks(shortlinks): + """Follow redirects in list of shortlinks, return dict of resulting URLs""" + links_followed = {} + for shortlink in shortlinks: + url = shortlink + request_result = requests.get(url) + redirect_history = request_result.history + # history might look like: + # (, ) + # where each response object has a URL + all_urls = [] + for redirect in redirect_history: + all_urls.append(redirect.url) + # append the final URL that we finish with + all_urls.append(request_result.url) + links_followed[shortlink] = all_urls + return links_followed + + +if __name__ == "__main__": + shortlinks = ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF'] + print follow_shortlinks(shortlinks)