# phenny-1/modules/etymology.py

#!/usr/bin/env python
"""
etymology.py - Phenny Etymology Module
Copyright 2007-9, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

http://inamidst.com/phenny/
"""

import re
import urllib.request
import web
from tools import deprecated

# Base URL of the etymonline.com index script; query parameters are appended.
etysite = 'http://www.etymonline.com/index.php?'
# Template for a term lookup; %s is replaced with the (quoted) word.
etyuri = etysite + 'allowed_in_frame=0&term=%s'
# Template for the site search, offered to the user when a lookup fails.
etysearch = etysite + 'allowed_in_frame=0&search=%s'

# One <dd>...</dd> element; case-insensitive, '.' also matches newlines.
r_definition = re.compile(r'(?ims)<dd[^>]*>.*?</dd>')
# Any markup tag except those starting with "<!" (comments, doctype).
r_tag = re.compile(r'<(?!!)[^>]+>')
# A run of tabs, carriage returns, newlines and spaces.
r_whitespace = re.compile(r'[\t\r\n ]+')

class Grab(urllib.request.URLopener):
    """URL opener that sends a Phenny User-Agent and does not raise on HTTP errors.

    NOTE(review): ``urllib.request.URLopener`` has been deprecated since
    Python 3.3 and is removed in Python 3.14; this class keeps using it
    because the rest of the module installs it as ``urllib.request._urlopener``.
    """

    def __init__(self, *args):
        # ``version`` must be set before the base initialiser runs: the base
        # class reads it to build its default User-Agent header.
        self.version = 'Mozilla/5.0 (Phenny)'
        # The original called ``urllib.URLopener.__init__``, a name that does
        # not exist in the Python 3 ``urllib`` package.
        super().__init__(*args)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Wrap an HTTP error response as an ordinary response object.

        Returns an ``addinfourl`` around *fp* whose "headers" slot carries
        the pair ``[headers, errcode]`` and whose URL is ``"http:" + url``.
        *errmsg* is unused.
        """
        # ``addinfourl`` is not an attribute of the ``urllib`` package in
        # Python 3; it is reachable through ``urllib.request``.
        return urllib.request.addinfourl(fp, [headers, errcode], "http:" + url)

abbrs = [
    'cf', 'lit', 'etc', 'Ger', 'Du', 'Skt', 'Rus', 'Eng', 'Amer.Eng', 'Sp', 
    'Fr', 'N', 'E', 'S', 'W', 'L', 'Gen', 'J.C', 'dial', 'Gk', 
    '19c', '18c', '17c', '16c', 'St', 'Capt', 'obs', 'Jan', 'Feb', 'Mar', 
    'Apr', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'c', 'tr', 'e', 'g'
]
t_sentence = r'^.*?(?<!%s)(?:\.(?= [A-Z0-9]|\Z)|\Z)'
r_sentence = re.compile(t_sentence % ')(?<!'.join(abbrs))

def unescape(s):
    """Return *s* with the entities &gt;, &lt; and &amp; turned into > < &."""
    # Applied in this order; &amp; is handled last, so '&amp;lt;' becomes
    # '&lt;' and is not unescaped a second time.
    for entity, char in (('&gt;', '>'), ('&lt;', '<'), ('&amp;', '&')):
        s = s.replace(entity, char)
    return s

def text(html):
    """Reduce an HTML fragment to plain text.

    Tags matched by r_tag are removed, each whitespace run matched by
    r_whitespace becomes a single space, the result is passed through
    unescape() and finally stripped of leading/trailing whitespace.
    """
    without_tags = r_tag.sub('', html)
    collapsed = r_whitespace.sub(' ', without_tags)
    plain = unescape(collapsed)
    return plain.strip()

def etymology(word):
    """Return the first sentence of the etymonline.com entry for *word*.

    The page for *word* is fetched through web.get with a temporary Grab
    opener installed, the first <dd> element is reduced to plain text, and
    its first sentence (as delimited by r_sentence) is returned wrapped in
    double quotes and suffixed with ' - etymonline.com'. Sentences longer
    than 275 characters are cut at a word boundary and end in ' [...]'.

    Returns None when the page has no <dd> element or no sentence matches.
    Raises ValueError when *word* is longer than 25 characters.
    """
    # @@ <nsh> sbp, would it be possible to have a flag for .ety to get 2nd/etc
    # entries? - http://swhack.com/logs/2006-07-19#T15-05-29

    if len(word) > 25:
        raise ValueError("Word too long: %s[...]" % word[:10])
    # The site files "axe" under the combined headword.
    word = {'axe': 'ax/axe'}.get(word, word)

    # Temporarily swap in our own opener (custom User-Agent plus Referer).
    # The previous opener is restored in ``finally`` so a failed request
    # cannot leave the process-wide opener replaced.
    previous_opener = urllib.request._urlopener
    opener = Grab()
    opener.addheader("Referer", "http://www.etymonline.com/")
    urllib.request._urlopener = opener
    try:
        page = web.get(etyuri % web.quote(word))
    finally:
        urllib.request._urlopener = previous_opener

    definitions = r_definition.findall(page)
    if not definitions:
        return None

    m = r_sentence.match(text(definitions[0]))
    if not m:
        return None

    # The former ``unicode(sentence, 'iso-8859-1')`` re-encoding step was
    # dropped: ``unicode`` is undefined on Python 3, so it always raised
    # NameError, which a bare ``except`` silently discarded.
    sentence = web.decode(m.group(0))

    maxlength = 275
    if len(sentence) > maxlength:
        sentence = sentence[:maxlength]
        # Leave room for the marker, then drop the (possibly cut) last word.
        words = sentence[:-5].split(' ')
        words.pop()
        sentence = ' '.join(words) + ' [...]'

    # Inner double quotes become single quotes so the outer pair stays clear.
    sentence = '"' + sentence.replace('"', "'") + '"'
    return sentence + ' - etymonline.com'

@deprecated
def f_etymology(self, origin, match, args):
    # Bot command handler: looks up the word captured in group 2 of *match*
    # and sends either the etymology, a "can't connect" notice, or a
    # "can't find" notice with a search link to origin.sender.
    word = match.group(2)

    try:
        # The word is handed to etymology() encoded as latin-1.
        found = etymology(word.encode('iso-8859-1'))
    except IOError:
        failure = "Can't connect to etymonline.com (%s)" % (etyuri % word)
        self.msg(origin.sender, failure)
        return
    except AttributeError:
        # Treated the same as "no result".
        found = None

    if found is None:
        search_uri = etysearch % word
        reply = 'Can\'t find the etymology for "%s". Try %s' % (word, search_uri)
    else:
        reply = found
    self.msg(origin.sender, reply)
# @@ Cf. http://swhack.com/logs/2006-01-04#T01-50-22
f_etymology.rule = (['ety'], r"(.+?)$")
f_etymology.thread = True
f_etymology.priority = 'high'

if __name__ == "__main__":
    # Command-line use: print the etymology of the first argument.
    import sys
    query = sys.argv[1]
    print(etymology(query))
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`#!/usr/bin/env python`
			`"""`
			`etymology.py - Phenny Etymology Module`
Updated some of the copyright dates. 2009-06-07 05:08:49 -04:00			`Copyright 2007-9, Sean B. Palmer, inamidst.com`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`Licensed under the Eiffel Forum License 2.`

			`http://inamidst.com/phenny/`
			`"""`

			`import re`
Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`import urllib.request`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`import web`
			`from tools import deprecated`

Updated etymology interface 2012-02-26 19:10:33 -05:00			`etysite = 'http://www.etymonline.com/index.php?'`
			`etyuri = etysite + 'allowed_in_frame=0&term=%s'`
			`etysearch = etysite + 'allowed_in_frame=0&search=%s'`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`r_definition = re.compile(r'(?ims)<dd[^>]*>.*?</dd>')`
			`r_tag = re.compile(r'<(?!!)[^>]+>')`
			`r_whitespace = re.compile(r'[\t\r\n ]+')`

Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`class Grab(urllib.request.URLopener):`
			`def __init__(self, *args):`
			`self.version = 'Mozilla/5.0 (Phenny)'`
			`urllib.URLopener.__init__(self, *args)`
			`def http_error_default(self, url, fp, errcode, errmsg, headers):`
			`return urllib.addinfourl(fp, [headers, errcode], "http:" + url)`
Updated etymology interface 2012-02-26 19:10:33 -05:00
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`abbrs = [`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`'cf', 'lit', 'etc', 'Ger', 'Du', 'Skt', 'Rus', 'Eng', 'Amer.Eng', 'Sp',`
			`'Fr', 'N', 'E', 'S', 'W', 'L', 'Gen', 'J.C', 'dial', 'Gk',`
			`'19c', '18c', '17c', '16c', 'St', 'Capt', 'obs', 'Jan', 'Feb', 'Mar',`
			`'Apr', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'c', 'tr', 'e', 'g'`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`]`
			`t_sentence = r'^.*?(?<!%s)(?:\.(?= [A-Z0-9]|\Z)|\Z)'`
			`r_sentence = re.compile(t_sentence % ')(?<!'.join(abbrs))`

			`def unescape(s):`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`s = s.replace('&gt;', '>')`
			`s = s.replace('&lt;', '<')`
			`s = s.replace('&amp;', '&')`
			`return s`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`def text(html):`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`html = r_tag.sub('', html)`
			`html = r_whitespace.sub(' ', html)`
			`return unescape(html).strip()`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`def etymology(word):`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`# @@ <nsh> sbp, would it be possible to have a flag for .ety to get 2nd/etc`
			`# entries? - http://swhack.com/logs/2006-07-19#T15-05-29`
Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`if len(word) > 25:`
			`raise ValueError("Word too long: %s[...]" % word[:10])`
			`word = {'axe': 'ax/axe'}.get(word, word)`

Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`grab = urllib.request._urlopener`
			`urllib.request._urlopener = Grab()`
			`urllib.request._urlopener.addheader("Referer", "http://www.etymonline.com/")`
			`bytes = web.get(etyuri % web.quote(word))`
			`urllib.request._urlopener = grab`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`definitions = r_definition.findall(bytes)`

			`if not definitions:`
			`return None`

			`defn = text(definitions[0])`
			`m = r_sentence.match(defn)`
			`if not m:`
			`return None`
			`sentence = m.group(0)`

			`try:`
Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`sentence = unicode(sentence, 'iso-8859-1')`
			`sentence = sentence.encode('utf-8')`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`except: pass`
Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`sentence = web.decode(sentence)`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00
			`maxlength = 275`
			`if len(sentence) > maxlength:`
			`sentence = sentence[:maxlength]`
			`words = sentence[:-5].split(' ')`
			`words.pop()`
			`sentence = ' '.join(words) + ' [...]'`

			`sentence = '"' + sentence.replace('"', "'") + '"'`
Merge branch 'master' of https://github.com/sbp/phenny 2012-03-10 17:14:28 -05:00			`return sentence + ' - etymonline.com'`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`@deprecated`
			`def f_etymology(self, origin, match, args):`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`word = match.group(2)`

			`try: result = etymology(word.encode('iso-8859-1'))`
			`except IOError:`
			`msg = "Can't connect to etymonline.com (%s)" % (etyuri % word)`
			`self.msg(origin.sender, msg)`
			`return`
			`except AttributeError:`
			`result = None`

			`if result is not None:`
			`self.msg(origin.sender, result)`
			`else:`
			`uri = etysearch % word`
			`msg = 'Can\'t find the etymology for "%s". Try %s' % (word, uri)`
			`self.msg(origin.sender, msg)`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`# @@ Cf. http://swhack.com/logs/2006-01-04#T01-50-22`
Allow latin1 searches of etymonline, which doesn't seem to support utf-8. 2011-06-17 11:56:49 -04:00			`f_etymology.rule = (['ety'], r"(.+?)$")`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`f_etymology.thread = True`
			`f_etymology.priority = 'high'`

			`if __name__=="__main__":`
module formatting tweaks; tabs -> spaces and more 2012-01-03 14:09:34 -05:00			`import sys`
			`print(etymology(sys.argv[1]))`