diff --git a/README.rst b/README.rst
index 207d3a9..2ed8cf0 100644
--- a/README.rst
+++ b/README.rst
@@ -1,34 +1,94 @@
twitter-text-python
===================
-**twitter-text-python** is a Tweet parser and formatter for Python.
+**twitter-text-python** is a Tweet parser and formatter for Python. It extracts users, hashtags and URLs from tweets, and formats tweets as HTML for display.
-It is based on twitter-text-java_ and passes all the unittests of
-twitter-text-conformance_ plus some additional ones.
+----
+**UPDATE**: this project is now maintained by Ed Burnett; please go here for the active version: https://github.com/edburnett/twitter-text-python
+----
+
+It is based on twitter-text-java_ and at the time of forking passed all the unit tests of
+twitter-text-conformance_ plus some additional ones. Note that the conformance tests are now behind (an easy PR for someone to work on: https://github.com/ianozsvald/twitter-text-python/issues/5).
.. _twitter-text-java: http://github.com/mzsanford/twitter-text-java
.. _twitter-text-conformance: http://github.com/mzsanford/twitter-text-conformance
+This version was forked by Ian Ozsvald in January 2013 and released to PyPI; some bugs were fixed and a few minor changes to functionality were added:
+https://github.com/ianozsvald/twitter-text-python
+
+PyPI release:
+http://pypi.python.org/pypi/twitter-text-python/
+
+The original ttp comes from Ivo Wetzel (Ivo's version is no longer supported):
+https://github.com/BonsaiDen/twitter-text-python
+
Usage::
- >>> import ttp
+ >>> from ttp import ttp
>>> p = ttp.Parser()
- >>> result = p.parse("@BonsaiDen Hey that's a great Tweet parser! #twp")
+ >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
>>> result.reply
- 'BonsaiDen'
+ 'ianozsvald'
>>> result.users
- ['BonsaiDen']
+ ['ianozsvald']
>>> result.tags
- ['twp']
+ ['IvoWertzel']
>>> result.urls
- []
+ ['https://github.com/ianozsvald/']
>>> result.html
- u'<a href="https://twitter.com/BonsaiDen">@BonsaiDen</a> Hey that\'s a great Tweet parser! <a href="https://twitter.com/search?q=%23twp">#twp</a>'
-
+ u'<a href="https://twitter.com/ianozsvald">@ianozsvald</a>, you now support <a href="https://twitter.com/search?q=%23IvoWertzel">#IvoWertzel</a>\'s tweet parser! <a href="https://github.com/ianozsvald/">https://github.com/ianozsvald/</a>'
If you need different HTML output, just subclass and override the ``format_*`` methods, as in the sketch below.
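+
+For example, a minimal sketch of a custom formatter (the class name and bold markup here are illustrative only)::
+
+ >>> class BoldTagParser(ttp.Parser):
+ ...     def format_tag(self, tag, text):
+ ...         return '<b>%s%s</b>' % (tag, text)
+ >>> BoldTagParser().parse(u'see #ttp').html
+ u'see <b>#ttp</b>'
+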
+You can also ask for the spans (start and end character offsets) to be returned for each entity::
+
+ >>> p = ttp.Parser(include_spans=True)
+ >>> result = p.parse("@ianozsvald, you now support #IvoWertzel's tweet parser! https://github.com/ianozsvald/")
+ >>> result.urls
+ [('https://github.com/ianozsvald/', (57, 87))]
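+ >>> result.users  # users and tags carry spans too (offsets computed by hand here)
+ [('ianozsvald', (0, 11))]
+ >>> result.tags
+ [('IvoWertzel', (29, 40))]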
+
+
+To use the shortlink follower::
+
+ >>> from ttp import utils
+ >>> # assume that result.urls == ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF']
+ >>> print utils.follow_shortlinks(result.urls) # pass in list of shortlink URLs
+ {'http://t.co/8o0z9BbEMu': [u'http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562'], u'http://bbc.in/16dClPF': [u'http://bbc.in/16dClPF', u'http://www.bbc.co.uk/sport/0/21711199#TWEET650562']}
+ >>> # note that a bad shortlink URL maps to an empty list (lost/forgotten shortlink URLs don't generate an error)
+
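+If you only want the final resolved URL for each shortlink, take the last entry of each chain; a minimal sketch::
+
+ >>> expanded = utils.follow_shortlinks(result.urls)
+ >>> dict((short, chain[-1]) for short, chain in expanded.items() if chain)
+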
+
+Installation
+------------
+
+**NOTE**: this version (Ian's) is no longer maintained; see Ed's active version instead: https://github.com/edburnett/twitter-text-python
+
+pip and easy_install will do the job::
+
+ # via: http://pypi.python.org/pypi/twitter-text-python
+ $ pip install twitter-text-python
+ $ python
+ >>> from ttp import ttp
+ >>> ttp.__version__
+ '1.0.1.0'
+
+Changelog
+---------
+
+ * 2013/2/11 1.0.0.2 released to PyPI
+ * 2013/6/1 1.0.1 new working version: added a comma-parsing fix (thanks https://github.com/muckrack), cleaned the source with autopep8, added a shortlink expander
+
+
+Tests
+-----
+
+Check out the code from GitHub at https://github.com/ianozsvald/twitter-text-python and run the tests locally::
+
+ $ python ttp/tests.py
+ ....................................................................................................
+ ----------------------------------------------------------------------
+ Ran 100 tests in 0.009s
+ OK
+
Contributing
------------
@@ -37,23 +97,53 @@ The source is available on GitHub_, to
contribute to the project, fork it on GitHub and send a pull request.
Everyone is welcome to make improvements to **twp**!
-.. _GitHub: http://github.com/BonsaiDen/twitter-text-python
+.. _GitHub: https://github.com/ianozsvald/twitter-text-python
+
+
+Todo
+----
+
+ * Consider adding capitalised phrase identification
+ * Consider adding a repeated-char remover (e.g. grrrrrrr->grr)
+ * Make it 1 line to parse and get a results dict via __init__.py (see the sketch below)
+ * Tag the next release
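+
+A hypothetical sketch of that one-line helper (the function name and dict keys are illustrative; nothing like this exists in the package yet)::
+
+ from ttp import ttp
+
+ def parse_tweet(text):
+     """Parse a tweet and return a plain dict of the results (sketch)."""
+     result = ttp.Parser().parse(text)
+     return {'users': result.users, 'tags': result.tags,
+             'urls': result.urls, 'reply': result.reply,
+             'html': result.html}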
+
+Doing a release
+---------------
+
+In the parent directory on Ian's machine, see USE_THIS_FOR_PYPI_RELEASE.txt. The short form::
+
+ $ # edit setup.py to bump the version number
+ $ git tag -a v1.0.1 -m 'v1.0.1 release'
+ $ git push origin --tags
+ ianozsvald-twitter-text-python $ python setup.py sdist register upload -r http://pypi.python.org/pypi
+ $ # this uses ~/.pypirc with cached login details
+
License
-=======
+-------
+
+*MIT*
+
+Copyright (c) 2012 Ivo Wetzel.
-Copyright (c) 2010 Ivo Wetzel
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
-**twitter-text-python** is free software: you can redistribute it and/or
-modify it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
-**twitter-text-python** is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
-You should have received a copy of the GNU General Public License along with
-**twitter-text-python**. If not, see <http://www.gnu.org/licenses/>.
+Copyright (c) 2010-2013 Ivo Wetzel
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..6a99645
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests==1.1.0
diff --git a/setup.py b/setup.py
index 9de7d83..2a9dd84 100644
--- a/setup.py
+++ b/setup.py
@@ -2,24 +2,25 @@
setup(
name='twitter-text-python',
- version='1.0',
- description='Tweet parser and formatter',
- long_description=open('README.rst').read(),
- author='Ivo Wetzel',
- author_email='',
- url='http://github.com/BonsaiDen/twitter-text-python',
- license='GPL',
- py_modules=['ttp'],
+ version='1.0.1',
+ description='Twitter Tweet parser and formatter',
+ long_description="Extract @users, #hashtags and URLs (and unwind shortened links) from tweets, including entity locations, and generate HTML for output. Visit https://github.com/ianozsvald/twitter-text-python for examples.",
+ #open('README.rst').read(),
+ author='Maintained by Ian Ozsvald (originally by Ivo Wetzel)',
+ author_email='ian@ianozsvald.com',
+ url='https://github.com/ianozsvald/twitter-text-python',
+ license='MIT',
+ packages=['ttp'],
include_package_data=True,
zip_safe=False,
install_requires=[],
classifiers=[
- 'Environment :: Web Environment',
- # I don't know what exactly this means, but why not?
+ 'Environment :: Console',
'Intended Audience :: Developers',
- 'License :: OSI Approved :: BSD License',
+ 'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Programming Language :: Python',
'Topic :: Software Development :: Libraries :: Python Modules',
+ 'Topic :: Text Processing :: Linguistic',
]
)
diff --git a/ttp/__init__.py b/ttp/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests.py b/ttp/tests.py
similarity index 85%
rename from tests.py
rename to ttp/tests.py
index e084abc..39aa5ab 100644
--- a/tests.py
+++ b/ttp/tests.py
@@ -22,534 +22,593 @@
class TWPTests(unittest.TestCase):
+
def setUp(self):
self.parser = ttp.Parser()
-
-
+
# General Tests ------------------------------------------------------------
# --------------------------------------------------------------------------
+ def test_urls(self):
+ """Confirm that # in a URL works along with ,"""
+ result = self.parser.parse(u'big url: http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2 #ahashtag')
+ self.assertEqual(result.urls, [u'http://blah.com:8080/path/to/here?p=1&q=abc,def#posn2'])
+ self.assertEqual(result.tags, [u'ahashtag'])
+
def test_all_not_allow_amp_without_question(self):
result = self.parser.parse(u'Check out: http://www.github.com/test&@username')
self.assertEqual(result.html, u'Check out: http://www.github.com/test&@username')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.urls, [u'http://www.github.com/test'])
-
+
def test_all_not_break_url_at(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
self.assertEqual(result.html, u'http://www.flickr.com/photo...')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-
-
+
# URL tests ----------------------------------------------------------------
# --------------------------------------------------------------------------
def test_url_mid(self):
result = self.parser.parse(u'text http://example.com more text')
self.assertEqual(result.html, u'text http://example.com more text')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_unicode(self):
result = self.parser.parse(u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp')
self.assertEqual(result.html, u'I enjoy Macintosh Brand computers: http://✪df.ws/ejp')
self.assertEqual(result.urls, [u'http://\u272adf.ws/ejp'])
-
+
def test_url_parentheses(self):
result = self.parser.parse(u'text (http://example.com)')
self.assertEqual(result.html, u'text (http://example.com)')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_underscore(self):
result = self.parser.parse(u'text http://example.com/test/foo_123.jpg')
self.assertEqual(result.html, u'text http://example.com/test/foo...')
self.assertEqual(result.urls, [u'http://example.com/test/foo_123.jpg'])
-
+
def test_url_underscore_dot(self):
result = self.parser.parse(u'text http://example.com/test/bla.net_foo_123.jpg')
self.assertEqual(result.html, u'text http://example.com/test/bla...')
self.assertEqual(result.urls, [u'http://example.com/test/bla.net_foo_123.jpg'])
-
+
def test_url_amp_lang_equals(self):
result = self.parser.parse(u'Check out http://search.twitter.com/search?q=avro&lang=en')
self.assertEqual(result.html, u'Check out http://search.twitter.com/s...')
self.assertEqual(result.urls, [u'http://search.twitter.com/search?q=avro&lang=en'])
-
+
def test_url_amp_break(self):
result = self.parser.parse(u'Check out http://twitter.com/te?foo&invalid=True')
self.assertEqual(result.html, u'Check out http://twitter.com/te?foo...')
self.assertEqual(result.urls, [u'http://twitter.com/te?foo&invalid=True'])
-
+
def test_url_dash(self):
result = self.parser.parse(u'Is www.foo-bar.com a valid URL?')
self.assertEqual(result.html, u'Is www.foo-bar.com a valid URL?')
self.assertEqual(result.urls, [u'www.foo-bar.com'])
-
+
def test_url_multiple(self):
result = self.parser.parse(u'http://example.com https://sslexample.com http://sub.example.com')
- self.assertEqual(result.html, u'http://example.comhttps://sslexample.comhttp://sub.example.com')
+ self.assertEqual(
+ result.html, u'http://example.comhttps://sslexample.comhttp://sub.example.com')
self.assertEqual(result.urls, [u'http://example.com', u'https://sslexample.com', u'http://sub.example.com'])
-
+
def test_url_raw_domain(self):
result = self.parser.parse(u'See http://example.com example.com')
self.assertEqual(result.html, u'See http://example.com example.com')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_embed_link(self):
result = self.parser.parse(u'http://example.com')
self.assertEqual(result.html, u'http://example.com')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_trailing(self):
result = self.parser.parse(u'text http://example.com')
self.assertEqual(result.html, u'text http://example.com')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_japanese(self):
result = self.parser.parse(u'いまなにしてるhttp://example.comいまなにしてる')
self.assertEqual(result.html, u'いまなにしてるhttp://example.comいまなにしてる')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_lots_of_punctuation(self):
result = self.parser.parse(u'text http://xo.com/~matthew+%-,.;x')
self.assertEqual(result.html, u'text http://xo.com/~matthew+%-,.;x')
self.assertEqual(result.urls, [u'http://xo.com/~matthew+%-,.;x'])
-
+
def test_url_question_numbers(self):
result = self.parser.parse(u'text http://example.com/?77e8fd')
self.assertEqual(result.html, u'text http://example.com/?77e8fd')
self.assertEqual(result.urls, [u'http://example.com/?77e8fd'])
-
+
def test_url_one_letter_other(self):
result = self.parser.parse(u'text http://u.nu/')
self.assertEqual(result.html, u'text http://u.nu/')
self.assertEqual(result.urls, [u'http://u.nu/'])
-
+
result = self.parser.parse(u'text http://u.tv/')
self.assertEqual(result.html, u'text http://u.tv/')
self.assertEqual(result.urls, [u'http://u.tv/'])
-
+
def test_url_one_letter_iana(self):
result = self.parser.parse(u'text http://x.com/')
self.assertEqual(result.html, u'text http://x.com/')
self.assertEqual(result.urls, [u'http://x.com/'])
-
+
result = self.parser.parse(u'text http://Q.com/')
self.assertEqual(result.html, u'text http://Q.com/')
self.assertEqual(result.urls, [u'http://Q.com/'])
-
+
result = self.parser.parse(u'text http://z.com/')
self.assertEqual(result.html, u'text http://z.com/')
self.assertEqual(result.urls, [u'http://z.com/'])
-
+
result = self.parser.parse(u'text http://i.net/')
self.assertEqual(result.html, u'text http://i.net/')
self.assertEqual(result.urls, [u'http://i.net/'])
-
+
result = self.parser.parse(u'text http://q.net/')
self.assertEqual(result.html, u'text http://q.net/')
self.assertEqual(result.urls, [u'http://q.net/'])
-
+
result = self.parser.parse(u'text http://X.org/')
self.assertEqual(result.html, u'text http://X.org/')
self.assertEqual(result.urls, [u'http://X.org/'])
-
+
def test_url_long_hypens(self):
result = self.parser.parse(u'text http://word-and-a-number-8-ftw.domain.tld/')
self.assertEqual(result.html, u'text http://word-and-a-number-8-...')
self.assertEqual(result.urls, [u'http://word-and-a-number-8-ftw.domain.tld/'])
-
-
+
# URL not tests ------------------------------------------------------------
def test_not_url_dotdotdot(self):
result = self.parser.parse(u'Is www...foo a valid URL?')
self.assertEqual(result.html, u'Is www...foo a valid URL?')
self.assertEqual(result.urls, [])
-
+
def test_not_url_dash(self):
result = self.parser.parse(u'Is www.-foo.com a valid URL?')
self.assertEqual(result.html, u'Is www.-foo.com a valid URL?')
self.assertEqual(result.urls, [])
-
+
def test_not_url_no_tld(self):
result = self.parser.parse(u'Is http://no-tld a valid URL?')
self.assertEqual(result.html, u'Is http://no-tld a valid URL?')
self.assertEqual(result.urls, [])
-
+
def test_not_url_tld_too_short(self):
result = self.parser.parse(u'Is http://tld-too-short.x a valid URL?')
self.assertEqual(result.html, u'Is http://tld-too-short.x a valid URL?')
self.assertEqual(result.urls, [])
-
- def test_all_not_break_url_at(self):
+
+ def test_all_not_break_url_at2(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
self.assertEqual(result.html, u'http://www.flickr.com/photo...')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-
+
def test_not_url_one_letter_iana(self):
result = self.parser.parse(u'text http://a.com/ http://a.net/ http://a.org/')
self.assertEqual(result.html, u'text http://a.com/ http://a.net/ http://a.org/')
self.assertEqual(result.urls, [])
-
-
+
# URL followed Tests -------------------------------------------------------
def test_url_followed_question(self):
result = self.parser.parse(u'text http://example.com?')
self.assertEqual(result.html, u'text http://example.com?')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_colon(self):
result = self.parser.parse(u'text http://example.com:')
self.assertEqual(result.html, u'text http://example.com:')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_curly_brace(self):
result = self.parser.parse(u'text http://example.com}')
self.assertEqual(result.html, u'text http://example.com}')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_single_quote(self):
result = self.parser.parse(u'text http://example.com')
self.assertEqual(result.html, u'text http://example.com')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_dot(self):
result = self.parser.parse(u'text http://example.com.')
self.assertEqual(result.html, u'text http://example.com.')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_exclamation(self):
result = self.parser.parse(u'text http://example.com!')
self.assertEqual(result.html, u'text http://example.com!')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_comma(self):
result = self.parser.parse(u'text http://example.com,')
self.assertEqual(result.html, u'text http://example.com,')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
+ def test_url_with_path_preceeded_by_comma(self):
+ result = self.parser.parse(u'text ,http://example.com/abcde, more')
+ self.assertEqual(result.html, u'text ,http://example.com/abcde, more')
+ self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
+ def test_url_with_path_followed_comma(self):
+ result = self.parser.parse(u'text http://example.com/abcde, more')
+ self.assertEqual(result.html, u'text http://example.com/abcde, more')
+ self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
+ def test_url_with_path_followed_commas(self):
+ result = self.parser.parse(u'text http://example.com/abcde,, more')
+ self.assertEqual(result.html, u'text http://example.com/abcde,, more')
+ self.assertEqual(result.urls, [u'http://example.com/abcde'])
+
def test_url_followed_brace(self):
result = self.parser.parse(u'text http://example.com)')
self.assertEqual(result.html, u'text http://example.com)')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_big_brace(self):
result = self.parser.parse(u'text http://example.com]')
self.assertEqual(result.html, u'text http://example.com]')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_equals(self):
result = self.parser.parse(u'text http://example.com=')
self.assertEqual(result.html, u'text http://example.com=')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_semicolon(self):
result = self.parser.parse(u'text http://example.com;')
self.assertEqual(result.html, u'text http://example.com;')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_url_followed_hypen(self):
result = self.parser.parse(u'text http://domain.tld-that-you-should-have-put-a-space-after')
self.assertEqual(result.html, u'text http://domain.tld-that-you-should-have-put-a-space-after')
self.assertEqual(result.urls, [u'http://domain.tld'])
-
-
+
# URL preceded Tests -------------------------------------------------------
def test_url_preceeded_colon(self):
result = self.parser.parse(u'text:http://example.com')
self.assertEqual(result.html, u'text:http://example.com')
self.assertEqual(result.urls, [u'http://example.com'])
-
+
def test_not_url_preceeded_equals(self):
result = self.parser.parse(u'text =http://example.com')
self.assertEqual(result.html, u'text =http://example.com')
self.assertEqual(result.urls, [])
-
+
# NOT
def test_not_url_preceeded_forwardslash(self):
result = self.parser.parse(u'text /http://example.com')
self.assertEqual(result.html, u'text /http://example.com')
self.assertEqual(result.urls, [])
-
+
def test_not_url_preceeded_exclamation(self):
result = self.parser.parse(u'text !http://example.com')
self.assertEqual(result.html, u'text !http://example.com')
self.assertEqual(result.urls, [])
-
-
+
# URL numeric tests --------------------------------------------------------
def test_url_at_numeric(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/4382024406')
self.assertEqual(result.html, u'http://www.flickr.com/photo...')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/4382024406'])
-
+
def test_url_at_non_numeric(self):
result = self.parser.parse(u'http://www.flickr.com/photos/29674651@N00/foobar')
self.assertEqual(result.html, u'http://www.flickr.com/photo...')
self.assertEqual(result.urls, [u'http://www.flickr.com/photos/29674651@N00/foobar'])
-
-
+
# URL domain tests ---------------------------------------------------------
def test_url_WWW(self):
result = self.parser.parse(u'WWW.EXAMPLE.COM')
self.assertEqual(result.html, u'WWW.EXAMPLE.COM')
self.assertEqual(result.urls, [u'WWW.EXAMPLE.COM'])
-
+
def test_url_www(self):
result = self.parser.parse(u'www.example.com')
self.assertEqual(result.html, u'www.example.com')
self.assertEqual(result.urls, [u'www.example.com'])
-
+
def test_url_only_domain_query_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.i.want.it. Even when they contain a URL.')
- self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.')
+ self.assertEqual(
+ result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why?=because.... Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me/why?=because.i.want.it'])
-
+
def test_url_only_domain_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.')
self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me. Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me'])
-
+
def test_url_only_domain_path_followed_period(self):
result = self.parser.parse(u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.')
self.assertEqual(result.html, u'I think it\'s proper to end sentences with a period http://tell.me/why. Even when they contain a URL.')
self.assertEqual(result.urls, [u'http://tell.me/why'])
-
+
def test_url_long_tld(self):
result = self.parser.parse(u'http://example.mobi/path')
self.assertEqual(result.html, u'http://example.mobi/path')
self.assertEqual(result.urls, [u'http://example.mobi/path'])
-
+
def test_url_multiple_protocols(self):
result = self.parser.parse(u'http://foo.com AND https://bar.com AND www.foobar.com')
self.assertEqual(result.html, u'http://foo.com AND https://bar.com AND www.foobar.com')
self.assertEqual(result.urls, [u'http://foo.com', u'https://bar.com', u'www.foobar.com'])
-
+
# NOT
def test_not_url_exclamation_domain(self):
result = self.parser.parse(u'badly formatted http://foo!bar.com')
self.assertEqual(result.html, u'badly formatted http://foo!bar.com')
self.assertEqual(result.urls, [])
-
+
def test_not_url_under_domain(self):
result = self.parser.parse(u'badly formatted http://foo_bar.com')
self.assertEqual(result.html, u'badly formatted http://foo_bar.com')
self.assertEqual(result.urls, [])
-
-
+
# Hashtag tests ------------------------------------------------------------
# --------------------------------------------------------------------------
def test_hashtag_followed_full_whitespace(self):
result = self.parser.parse(u'#hashtag text')
self.assertEqual(result.html, u'#hashtag text')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_followed_full_hash(self):
result = self.parser.parse(u'#hashtag')
self.assertEqual(result.html, u'#hashtag')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_preceeded_full_whitespace(self):
result = self.parser.parse(u'text #hashtag')
self.assertEqual(result.html, u'text #hashtag')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_number(self):
result = self.parser.parse(u'text #1tag')
self.assertEqual(result.html, u'text #1tag')
self.assertEqual(result.tags, [u'1tag'])
-
+
def test_not_hashtag_escape(self):
result = self.parser.parse(u'nbsp;')
self.assertEqual(result.html, u'nbsp;')
self.assertEqual(result.tags, [])
-
+
def test_hashtag_japanese(self):
result = self.parser.parse(u'text #hashtagの')
self.assertEqual(result.html, u'text #hashtagの')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_period(self):
result = self.parser.parse(u'text.#hashtag')
self.assertEqual(result.html, u'text.#hashtag')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_trailing(self):
result = self.parser.parse(u'text #hashtag')
self.assertEqual(result.html, u'text #hashtag')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_not_hashtag_exclamation(self):
result = self.parser.parse(u'text #hashtag!')
self.assertEqual(result.html, u'text #hashtag!')
self.assertEqual(result.tags, [u'hashtag'])
-
+
def test_hashtag_multiple(self):
result = self.parser.parse(u'text #hashtag1 #hashtag2')
self.assertEqual(result.html, u'text #hashtag1#hashtag2')
self.assertEqual(result.tags, [u'hashtag1', u'hashtag2'])
-
+
def test_not_hashtag_number(self):
result = self.parser.parse(u'text #1234')
self.assertEqual(result.html, u'text #1234')
self.assertEqual(result.tags, [])
-
+
def test_not_hashtag_text(self):
result = self.parser.parse(u'text#hashtag')
self.assertEqual(result.html, u'text#hashtag')
self.assertEqual(result.tags, [])
-
+
def test_hashtag_umlaut(self):
result = self.parser.parse(u'text #hash_tagüäö')
self.assertEqual(result.html, u'text #hash_tagüäö')
self.assertEqual(result.tags, [u'hash_tag\xfc\xe4\xf6'])
-
+
def test_hashtag_alpha(self):
result = self.parser.parse(u'text #hash0tag')
self.assertEqual(result.html, u'text #hash0tag')
self.assertEqual(result.tags, [u'hash0tag'])
-
+
def test_hashtag_under(self):
result = self.parser.parse(u'text #hash_tag')
self.assertEqual(result.html, u'text #hash_tag')
self.assertEqual(result.tags, [u'hash_tag'])
-
-
+
# Username tests -----------------------------------------------------------
# --------------------------------------------------------------------------
def test_not_username_preceded_letter(self):
result = self.parser.parse(u'meet@the beach')
self.assertEqual(result.html, u'meet@the beach')
self.assertEqual(result.users, [])
-
+
def test_username_preceded_punctuation(self):
result = self.parser.parse(u'.@username')
self.assertEqual(result.html, u'.@username')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_preceded_japanese(self):
result = self.parser.parse(u'あ@username')
self.assertEqual(result.html, u'あ@username')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_followed_japanese(self):
result = self.parser.parse(u'@usernameの')
self.assertEqual(result.html, u'@usernameの')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_surrounded_japanese(self):
result = self.parser.parse(u'あ@usernameの')
self.assertEqual(result.html, u'あ@usernameの')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_followed_punctuation(self):
result = self.parser.parse(u'@username&^$%^')
self.assertEqual(result.html, u'@username&^$%^')
self.assertEqual(result.users, [u'username'])
-
+
def test_not_username_spaced(self):
result = self.parser.parse(u'@ username')
self.assertEqual(result.html, u'@ username')
self.assertEqual(result.users, [])
-
+
def test_username_beginning(self):
result = self.parser.parse(u'@username text')
self.assertEqual(result.html, u'@username text')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_to_long(self):
result = self.parser.parse(u'@username9012345678901')
self.assertEqual(result.html, u'@username9012345678901')
self.assertEqual(result.users, [u'username901234567890'])
-
+
def test_username_full_at_sign(self):
result = self.parser.parse(u'@username')
self.assertEqual(result.html, u'@username')
self.assertEqual(result.users, [u'username'])
-
+
def test_username_trailing(self):
result = self.parser.parse(u'text @username')
self.assertEqual(result.html, u'text @username')
self.assertEqual(result.users, [u'username'])
-
+
# Replies
def test_username_reply_simple(self):
result = self.parser.parse(u'@username')
self.assertEqual(result.html, u'@username')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')
-
+
def test_username_reply_whitespace(self):
result = self.parser.parse(u' @username')
self.assertEqual(result.html, u' @username')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')
-
+
def test_username_reply_full(self):
result = self.parser.parse(u' @username')
self.assertEqual(result.html, u' @username')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, u'username')
-
+
def test_username_non_reply(self):
result = self.parser.parse(u'test @username')
self.assertEqual(result.html, u'test @username')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.reply, None)
-
-
+
# List tests ---------------------------------------------------------------
# --------------------------------------------------------------------------
def test_list_preceeded(self):
result = self.parser.parse(u'text @username/list')
self.assertEqual(result.html, u'text @username/list')
self.assertEqual(result.lists, [(u'username', u'list')])
-
+
def test_list_beginning(self):
result = self.parser.parse(u'@username/list')
self.assertEqual(result.html, u'@username/list')
self.assertEqual(result.lists, [(u'username', u'list')])
-
+
def test_list_preceeded_punctuation(self):
result = self.parser.parse(u'.@username/list')
self.assertEqual(result.html, u'.@username/list')
self.assertEqual(result.lists, [(u'username', u'list')])
-
+
def test_list_followed_punctuation(self):
result = self.parser.parse(u'@username/list&^$%^')
self.assertEqual(result.html, u'@username/list&^$%^')
self.assertEqual(result.lists, [(u'username', u'list')])
-
+
def test_list_not_slash_space(self):
result = self.parser.parse(u'@username/ list')
self.assertEqual(result.html, u'@username/ list')
self.assertEqual(result.users, [u'username'])
self.assertEqual(result.lists, [])
-
- def test_list_beginning(self):
+
+ def test_list_beginning2(self):
result = self.parser.parse(u'@username/list')
self.assertEqual(result.html, u'@username/list')
self.assertEqual(result.lists, [(u'username', u'list')])
-
+
def test_list_not_empty_username(self):
result = self.parser.parse(u'text @/list')
self.assertEqual(result.html, u'text @/list')
self.assertEqual(result.lists, [])
-
+
def test_list_not_preceeded_letter(self):
result = self.parser.parse(u'meet@the/beach')
self.assertEqual(result.html, u'meet@the/beach')
self.assertEqual(result.lists, [])
-
+
def test_list_long_truncate(self):
result = self.parser.parse(u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
- self.assertEqual(result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
+ self.assertEqual(
+ result.html, u'@username/list5678901234567890123456789012345678901234567890123456789012345678901234567890A')
self.assertEqual(result.lists, [(u'username', u'list5678901234567890123456789012345678901234567890123456789012345678901234567890')])
-
+
def test_list_with_dash(self):
result = self.parser.parse(u'text @username/list-foo')
self.assertEqual(result.html, u'text @username/list-foo')
self.assertEqual(result.lists, [(u'username', u'list-foo')])
+class TWPTestsWithSpans(unittest.TestCase):
+
+ """Test ttp with re spans to extract character co-ords of matches"""
+ def setUp(self):
+ self.parser = ttp.Parser(include_spans=True)
+
+ def test_spans_in_tweets(self):
+ """Test some coca-cola tweets taken from twitter with spans"""
+ result = self.parser.parse(u'Coca-Cola Hits 50 Million Facebook Likes http://bit.ly/QlKOc7')
+ self.assertEqual(result.urls, [('http://bit.ly/QlKOc7', (41, 61))])
+
+ result = self.parser.parse(u' #ABillionReasonsToBelieveInAfrica ARISE MAG.FASHION WEEK NY! Tsemaye B,Maki Oh,Tiffany Amber, Ozwald.Showin NY reasons2beliv @CocaCola_NG', html=False)
+ self.assertEqual(result.urls, [])
+ self.assertEqual(result.tags, [(u'ABillionReasonsToBelieveInAfrica', (1, 34))])
+ self.assertEqual(result.users, [(u'CocaCola_NG', (126, 138))])
+
+ result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA', html=False)
+ self.assertEqual(result.urls, [(u'http://bit.ly/EANCAA', (95, 115))])
+ self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72))])
+ self.assertEqual(result.tags, [(u'GameOn', (76, 83)), (u'ad', (84, 87))])
+
+ def test_users_in_tweets(self):
+ result = self.parser.parse(u'Follow @CokeZero & Retweet for a chance to win @EASPORTS @EANCAAFootball 13 #GameOn #ad Rules: http://bit.ly/EANCAA @someone', html=False)
+ self.assertEqual(result.users, [(u'CokeZero', (7, 16)), (u'EASPORTS', (47, 56)), (u'EANCAAFootball', (57, 72)), (u'someone', (116, 124))])
+
+ def test_edge_cases(self):
+ """Some edge cases that upset the original version of ttp"""
+ result = self.parser.parse(u' @user', html=False)
+ self.assertEqual(result.users, [(u'user', (1, 6))])
+
+ result = self.parser.parse(u' #hash ', html=False)
+ self.assertEqual(result.tags, [(u'hash', (1, 6))])
+
+ result = self.parser.parse(u' http://some.com ', html=False)
+ self.assertEqual(result.urls, [(u'http://some.com', (1, 16))])
+
+
# Test it!
if __name__ == '__main__':
unittest.main()
+ # verbosity = 0 # set to 2 for verbose output
+ # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpansEdgeCases)
+ # unittest.TextTestRunner(verbosity=verbosity).run(suite)
+ # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTestsWithSpans)
+ # unittest.TextTestRunner(verbosity=verbosity).run(suite)
+ # suite = unittest.TestLoader().loadTestsFromTestCase(TWPTests)
+ # unittest.TextTestRunner(verbosity=verbosity).run(suite)
diff --git a/ttp.py b/ttp/ttp.py
similarity index 82%
rename from ttp.py
rename to ttp/ttp.py
index 27102a9..ac7c79e 100644
--- a/ttp.py
+++ b/ttp/ttp.py
@@ -13,14 +13,18 @@
# You should have received a copy of the GNU General Public License along with
# twitter-text-python. If not, see <http://www.gnu.org/licenses/>.
-# TODO create a setup.py
-
+# Forked by Ian Ozsvald:
+# https://github.com/ianozsvald/twitter-text-python
+# from:
+# https://github.com/BonsaiDen/twitter-text-python
# Tweet Parser and Formatter ---------------------------------------------------
# ------------------------------------------------------------------------------
import re
import urllib
+__version__ = "1.0.1.0"
+
# Some of this code has been translated from the twitter-text-java library:
#
AT_SIGNS = ur'[@\uff20]'
@@ -35,8 +39,8 @@
# Users
USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE)
-REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS \
- + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE)
+REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS
+ + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE)
# Hashtags
HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS
@@ -55,76 +59,78 @@
PATH_ENDING_CHARS = r'[%s\)=#/]' % UTF_CHARS
QUERY_ENDING_CHARS = '[a-z0-9_&=#]'
-URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/%s*%s?)?(\?%s*%s)?))'
+URL_REGEX = re.compile('((%s)((https?://|www\\.)(%s)(\/(%s*%s)?)?(\?%s*%s)?))'
% (PRE_CHARS, DOMAIN_CHARS, PATH_CHARS,
PATH_ENDING_CHARS, QUERY_CHARS, QUERY_ENDING_CHARS),
- re.IGNORECASE)
-
+ re.IGNORECASE)
# Registered IANA one letter domains
IANA_ONE_LETTER_DOMAINS = ('x.com', 'x.org', 'z.com', 'q.net', 'q.com', 'i.net')
class ParseResult(object):
+
'''A class containing the results of a parsed Tweet.
-
+
Attributes:
- urls:
A list containing all the valid urls in the Tweet.
-
+
- users
A list containing all the valid usernames in the Tweet.
-
+
- reply
A string containing the username this tweet was a reply to.
This only matches a username at the beginning of the Tweet;
it may, however, be preceded by whitespace.
Note: It's generally better to rely on the Tweet JSON/XML in order to
find out if it's a reply or not.
-
+
- lists
A list containing all the valid lists in the Tweet.
Each list item is a tuple in the format (username, listname).
-
+
- tags
A list containing all the valid tags in the Tweet.
-
+
- html
A string containing formatted HTML.
To change the formatting subclass twp.Parser and override the format_*
methods.
-
+
'''
-
+
def __init__(self, urls, users, reply, lists, tags, html):
- self.urls = list(set(urls)) if urls else [] #fixes dups
- self.users = list(set(users)) if users else []
- self.lists = list(set(lists)) if lists else []
- self.reply = list(set(reply)) if reply else []
- self.tags = list(set(tags)) if tags else []
+ self.urls = urls if urls else []
+ self.users = users if users else []
+ self.lists = lists if lists else []
+ self.reply = reply if reply else None
+ self.tags = tags if tags else []
self.html = html
class Parser(object):
+
'''A Tweet Parser'''
-
- def __init__(self, max_url_length=30):
+
+ def __init__(self, max_url_length=30, include_spans=False):
self._max_url_length = max_url_length
-
+ self._include_spans = include_spans
+
def parse(self, text, html=True):
'''Parse the text and return a ParseResult instance.'''
self._urls = []
self._users = []
self._lists = []
self._tags = []
-
+
reply = REPLY_REGEX.match(text)
reply = reply.groups(0)[0] if reply is not None else None
-
+
parsed_html = self._html(text) if html else self._text(text)
return ParseResult(self._urls, self._users, reply,
self._lists, self._tags, parsed_html)
-
+
def _text(self, text):
'''Parse a Tweet without generating HTML.'''
URL_REGEX.sub(self._parse_urls, text)
@@ -132,84 +138,95 @@ def _text(self, text):
LIST_REGEX.sub(self._parse_lists, text)
HASHTAG_REGEX.sub(self._parse_tags, text)
return None
-
+
def _html(self, text):
'''Parse a Tweet and generate HTML.'''
html = URL_REGEX.sub(self._parse_urls, text)
html = USERNAME_REGEX.sub(self._parse_users, html)
html = LIST_REGEX.sub(self._parse_lists, html)
return HASHTAG_REGEX.sub(self._parse_tags, html)
-
-
+
# Internal parser stuff ----------------------------------------------------
def _parse_urls(self, match):
'''Parse URLs.'''
-
+
mat = match.group(0)
-
+
# Fix a bug in the regex concerning www...com and www.-foo.com domains
# TODO fix this in the regex instead of working around it here
domain = match.group(5)
if domain[0] in '.-':
return mat
-
+
# Only allow IANA one letter domains that are actually registered
if len(domain) == 5 \
and domain[-4:].lower() in ('.com', '.org', '.net') \
and not domain.lower() in IANA_ONE_LETTER_DOMAINS:
-
+
return mat
-
+
# Check for urls without http(s)
pos = mat.find('http')
if pos != -1:
pre, url = mat[:pos], mat[pos:]
full_url = url
-
+
# Find the www and force http://
else:
pos = mat.lower().find('www')
pre, url = mat[:pos], mat[pos:]
full_url = 'http://%s' % url
-
- self._urls.append(url)
-
+
+ if self._include_spans:
+ span = match.span(0)
+ # add an offset if pre is e.g. ' '
+ span = (span[0] + len(pre), span[1])
+ self._urls.append((url, span))
+ else:
+ self._urls.append(url)
+
if self._html:
return '%s%s' % (pre, self.format_url(full_url,
- self._shorten_url(escape(url))))
-
+ self._shorten_url(escape(url))))
+
def _parse_users(self, match):
'''Parse usernames.'''
-
+
# Don't parse lists here
if match.group(2) is not None:
return match.group(0)
-
+
mat = match.group(0)
- self._users.append(mat[1:])
-
+ if self._include_spans:
+ self._users.append((mat[1:], match.span(0)))
+ else:
+ self._users.append(mat[1:])
+
if self._html:
return self.format_username(mat[0:1], mat[1:])
-
+
def _parse_lists(self, match):
'''Parse lists.'''
-
+
# Don't parse usernames here
if match.group(4) is None:
return match.group(0)
-
+
pre, at_char, user, list_name = match.groups()
list_name = list_name[1:]
- self._lists.append((user, list_name))
-
+ if self._include_spans:
+ self._lists.append((user, list_name, match.span(0)))
+ else:
+ self._lists.append((user, list_name))
+
if self._html:
return '%s%s' % (pre, self.format_list(at_char, user, list_name))
-
+
def _parse_tags(self, match):
'''Parse hashtags.'''
-
+
mat = match.group(0)
-
+
# Fix problems with the regex capturing stuff in front of the #
tag = None
for i in u'#\uff03':
@@ -217,45 +234,50 @@ def _parse_tags(self, match):
if pos != -1:
tag = i
break
-
+
pre, text = mat[:pos], mat[pos + 1:]
- self._tags.append(text)
-
+ if self._include_spans:
+ span = match.span(0)
+ # add an offset if pre is e.g. ' '
+ span = (span[0] + len(pre), span[1])
+ self._tags.append((text, span))
+ else:
+ self._tags.append(text)
+
if self._html:
return '%s%s' % (pre, self.format_tag(tag, text))
-
+
def _shorten_url(self, text):
'''Shorten a URL and make sure not to cut off HTML entities.'''
-
+
if len(text) > self._max_url_length and self._max_url_length != -1:
text = text[0:self._max_url_length - 3]
amp = text.rfind('&')
close = text.rfind(';')
if amp != -1 and (close == -1 or close < amp):
text = text[0:amp]
-
+
return text + '...'
-
+
else:
return text
-
-
+
# User defined formatters --------------------------------------------------
def format_tag(self, tag, text):
'''Return formatted HTML for a hashtag.'''
return '<a href="https://twitter.com/search?q=%s">%s%s</a>' \
- % (urllib.quote('#' + text.encode('utf-8')), tag, text)
-
+ % (urllib.quote('#' + text.encode('utf-8')), tag, text)
+
def format_username(self, at_char, user):
'''Return formatted HTML for a username.'''
return '<a href="https://twitter.com/%s">%s%s</a>' \
% (user, at_char, user)
-
+
def format_list(self, at_char, user, list_name):
'''Return formatted HTML for a list.'''
return '<a href="https://twitter.com/%s/%s">%s%s/%s</a>' \
% (user, list_name, at_char, user, list_name)
-
+
def format_url(self, url, text):
'''Return formatted HTML for a url.'''
return '<a href="%s">%s</a>' % (escape(url), text)
@@ -267,4 +289,3 @@ def escape(text):
return ''.join({'&': '&amp;', '"': '&quot;',
'\'': '&#39;', '>': '&gt;',
'<': '&lt;'}.get(c, c) for c in text)
-
diff --git a/ttp/utils.py b/ttp/utils.py
new file mode 100644
index 0000000..2c3d822
--- /dev/null
+++ b/ttp/utils.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Unwind short-links e.g. bit.ly, t.co etc to their canonical links"""
+import requests
+
+
+def follow_shortlinks(shortlinks):
+ """Follow redirects in list of shortlinks, return dict of resulting URLs"""
+ links_followed = {}
+ for shortlink in shortlinks:
+ url = shortlink
+ request_result = requests.get(url)
+ redirect_history = request_result.history
+ # history might look like:
+ # (<Response [301]>, <Response [301]>)
+ # where each response object has a URL
+ all_urls = []
+ for redirect in redirect_history:
+ all_urls.append(redirect.url)
+ # append the final URL that we finish with
+ all_urls.append(request_result.url)
+ links_followed[shortlink] = all_urls
+ return links_followed
+
+
+if __name__ == "__main__":
+ shortlinks = ['http://t.co/8o0z9BbEMu', u'http://bbc.in/16dClPF']
+ print follow_shortlinks(shortlinks)