refactor mediawiki modules into unified library

2012-06-13 21:58:31 -07:00
parent 63c6adb316
commit ec32741826
5 changed files with 88 additions and 317 deletions
--- a/modules/archwiki.py
+++ b/modules/archwiki.py
@@ -11,74 +11,34 @@ author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """
 import re, urllib.request, urllib.parse, urllib.error
-import web
+import wiki
 import json
-wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
+wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://wiki.archlinux.org/index.php/%s'
+wikiuri = 'https://wiki.archlinux.org/index.php/{0}'
 wikisearch = 'https://wiki.archlinux.org/index.php/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
+                          + 'search={0}&fulltext=Search'
 r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
 r_content = re.compile(r'(?ims)</p>\n</div>.*?<!-- end content -->')
 r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
 r_tag = re.compile(r'<(?!!)[^>]+>')
 r_whitespace = re.compile(r'[\t\r\n ]+')
 r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
 )
 abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
            'syn', 'transl', 'sess', 'fl', 'Op'] \
    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
    + list('abcdefghijklmnopqrstuvwxyz')
 t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
 r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
 def unescape(s):
    """Decode the few HTML entities this module cares about.

    Handles ``&#160;`` (turned into a plain space), ``&gt;``, ``&lt;`` and
    ``&amp;``. ``&amp;`` is decoded last so that doubly escaped text such as
    ``&amp;#160;`` is unescaped by exactly one level instead of collapsing
    all the way down to a space.
    """
    for entity, char in (('&#160;', ' '), ('&gt;', '>'),
                         ('&lt;', '<'), ('&amp;', '&')):
        s = s.replace(entity, char)
    return s
 def text(html):
    """Return *html* as plain text.

    Tags matched by ``r_tag`` are removed, whitespace runs matched by
    ``r_whitespace`` collapse to one space, entities are decoded with
    ``unescape`` and the result is stripped.
    """
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    return unescape(single_spaced).strip()
 def archwiki(term, last=False):
    """Search the ArchWiki API for *term* and describe the first hit.

    Returns ``"<snippet> - <page url>"`` for the first search result, or
    ``None`` when the response holds no results. ``last`` is accepted but
    not used. Exceptions raised by ``web.get`` propagate to the caller.
    """
    # NOTE(review): this relies on wikiapi/wikiuri being %-style templates;
    # confirm against the module constants.
    # The term is interpolated into a query string, so percent-encode it.
    url = wikiapi % urllib.parse.quote(term, safe='')
    payload = web.get(url)
    data = json.loads(payload)
    # A response without the 'query'/'search' members counts as "no hits".
    hits = data.get('query', {}).get('search', [])
    if not hits:
        return None
    title = hits[0]['title'].replace(' ', '_')
    snippet = text(hits[0]['snippet'])
    return "%s - %s" % (snippet, wikiuri % title)
 def awik(phenny, input): 
    """Answer the ``.awik`` command with an ArchWiki search result.

    The search term is the second group of ``input``. With no term a usage
    hint is sent. Otherwise the term is percent-decoded, its first character
    is upper-cased and spaces become underscores before the lookup. An
    ``IOError`` during the lookup is reported as a connection failure and a
    ``None`` result as "nothing found"; every reply goes through
    ``phenny.say``.
    """
    origterm = input.groups()[1]
    if not origterm: 
        return phenny.say('Perhaps you meant ".awik dwm"?')
    # Self-assignment: has no effect.
    origterm = origterm
    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')
-    try: result = archwiki(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
    try:
        result = w.search(term)
    except IOError: 
-        error = "Can't connect to wiki.archlinux.org (%s)" % (wikiuri % term)
+        error = "Can't connect to wiki.archlinux.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)
    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in the ArchWiki for "%s".' % origterm)
+    else:
        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(origterm))
 awik.commands = ['awik']
 awik.priority = 'high'
--- a/modules/uncyclopedia.py
+++ b/modules/uncyclopedia.py
@@ -1,165 +0,0 @@
 #!/usr/bin/env python
 """
 uncyclopedia.py - Phenny Uncyclopedia Module
 Copyright 2008-9, Sean B. Palmer, inamidst.com
 Licensed under the Eiffel Forum License 2.
 http://inamidst.com/phenny/
 modified from Wikipedia module
 author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """
 import re, urllib.request, urllib.parse, urllib.error
 import web
 wikiuri = 'http://uncyclopedia.wikia.com/wiki/%s'
 wikisearch = 'http://uncyclopedia.wikia.com/wiki/Special:Search?' \
                          + 'search=%s&fulltext=Search'
 r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
 r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
 r_tag = re.compile(r'<(?!!)[^>]+>')
 r_whitespace = re.compile(r'[\t\r\n ]+')
 r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
 )
 abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
            'syn', 'transl', 'sess', 'fl', 'Op'] \
    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
    + list('abcdefghijklmnopqrstuvwxyz')
 t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
 r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
 def unescape(s):
    """Decode the few HTML entities this module cares about.

    Handles ``&#160;`` (turned into a plain space), ``&gt;``, ``&lt;`` and
    ``&amp;``. ``&amp;`` is decoded last so that doubly escaped text such as
    ``&amp;#160;`` is unescaped by exactly one level instead of collapsing
    all the way down to a space.
    """
    for entity, char in (('&#160;', ' '), ('&gt;', '>'),
                         ('&lt;', '<'), ('&amp;', '&')):
        s = s.replace(entity, char)
    return s
 def text(html):
    """Return *html* as plain text.

    Tags matched by ``r_tag`` are removed, whitespace runs matched by
    ``r_whitespace`` collapse to one space, entities are decoded with
    ``unescape`` and the result is stripped.
    """
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    return unescape(single_spaced).strip()
 def search(term):
    """Resolve *term* to an Uncyclopedia page name via the sibling search module.

    Returns *term* unchanged when the sibling ``search`` module cannot be
    imported (the import error is printed). Otherwise bytes input is decoded
    as UTF-8, underscores become spaces, and a site-restricted query is run;
    the page name taken from the resulting URI is returned, or the
    space-separated term when the query yields nothing or raises
    ``IndexError``.
    """
    try:
        from . import search
    except ImportError as e:
        print(e)
        return term
    if not isinstance(term, str):
        term = term.decode('utf-8')
    term = term.replace('_', ' ')
    query = 'site:uncyclopedia.wikia.com %s' % term
    try:
        uri = search.result(query)
    except IndexError:
        return term
    if not uri:
        return term
    # Drop the site prefix so only the page name is left.
    prefix_length = len('http://uncyclopedia.wikia.com/wiki/')
    return uri[prefix_length:]
 def uncyclopedia(term, last=False):
    """Return the first sentence of the Uncyclopedia page for *term*.

    The page at ``wikiuri % term`` is fetched with ``web.get`` (the term is
    percent-encoded first unless it already contains ``%``). The result is
    the first sentence wrapped in double quotes, followed by ``' - '`` and
    the page URL, or ``None`` when nothing usable is found.

    While *last* is false the function makes one further attempt: it follows
    a redirect found in the first 4096 characters of the page, or, when no
    usable sentence can be extracted, retries with the term returned by
    ``search``. Each retry is made with ``last=True``, which stops the chain.
    """
    def fallback():
        # One retry through the search helper, then give up.
        if last:
            return None
        return uncyclopedia(search(term), last=True)

    # A '%' in the term is taken to mean it is already percent-encoded.
    if '%' not in term:
        page = web.get(wikiuri % urllib.parse.quote(term))
    else:
        page = web.get(wikiuri % term)
    # Table rows are dropped before paragraphs are extracted.
    page = r_tr.sub('', page)

    if not last:
        r = r_redirect.search(page[:4096])
        if r:
            term = urllib.parse.unquote(r.group(1))
            return uncyclopedia(term, last=True)

    paragraphs = r_paragraph.findall(page)
    if not paragraphs:
        return fallback()

    # Pre-process: discard paragraphs containing any of these markers, and
    # paragraphs that are entirely italic.
    skip_markers = (
        'technical limitations',
        'window.showTocToggle',
        'Deletion_policy',
        'Template:AfD_footer',
        'disambiguation)"',
        '(images and media)',
        'This article contains a',
        'id="coordinates"',
        'class="thumb',
        'There is currently no text in this page.',
    )
    paragraphs = [
        para for para in paragraphs
        if para
        and not any(marker in para for marker in skip_markers)
        and not (para.startswith('<p><i>') and para.endswith('</i></p>'))
    ]

    # Convert to plain text, then post-process: drop empty paragraphs and
    # short ones ending in a colon.
    cleaned = []
    for para in paragraphs:
        para = para.replace('<sup>', '|').replace('</sup>', '|')
        para = text(para).strip()
        if para and not (para.endswith(':') and len(para) < 150):
            cleaned.append(para)

    # Filtering can leave nothing behind; indexing an empty list here used
    # to raise IndexError, so take the same fallback as the other misses.
    if not cleaned:
        return fallback()

    para = text(cleaned[0])
    m = r_sentence.match(para)
    if not m:
        return fallback()
    sentence = m.group(0)

    maxlength = 275
    if len(sentence) > maxlength:
        # Cut at a word boundary, leaving room for the ' [...]' suffix.
        sentence = sentence[:maxlength]
        words = sentence[:-5].split(' ')
        words.pop()
        sentence = ' '.join(words) + ' [...]'

    # These phrases are treated as a miss rather than as article text.
    if (('using the Article Wizard if you wish' in sentence)
            or ('or add a request for it' in sentence)):
        return fallback()

    sentence = '"' + sentence.replace('"', "'") + '"'
    return sentence + ' - ' + (wikiuri % term)
 def uncyc(phenny, input):
    """Answer the ``.uncyc`` command with an Uncyclopedia summary.

    The search term is the second group of ``input``. With no term a usage
    hint is sent. Otherwise the term is percent-decoded, its first character
    is upper-cased and spaces become underscores before it is passed to
    ``uncyclopedia``. An ``IOError`` is reported as a connection failure and
    a ``None`` result as "nothing found".
    """
    origterm = input.groups()[1]
    if not origterm:
        return phenny.say('Perhaps you meant ".uncyc Zen"?')

    decoded = urllib.parse.unquote(origterm)
    term = (decoded[0].upper() + decoded[1:]).replace(' ', '_')

    try:
        result = uncyclopedia(term)
    except IOError:
        error = "Can't connect to uncyclopedia.wikia.com (%s)" % (wikiuri % term)
        return phenny.say(error)

    if result is None:
        phenny.say('Can\'t find anything in Uncyclopedia for "%s".' % origterm)
    else:
        phenny.say(result)
 uncyc.commands = ['uncyc']
 uncyc.priority = 'high'
 if __name__ == '__main__': 
    print(__doc__.strip())
--- a/modules/vtluugwiki.py
+++ b/modules/vtluugwiki.py
@@ -11,73 +11,34 @@ author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """
 import re, urllib.request, urllib.parse, urllib.error
-import web
+import wiki
 import json
-wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
+wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://vtluug.org/wiki/%s'
+wikiuri = 'https://vtluug.org/wiki/{0}'
 wikisearch = 'https://vtluug.org/wiki/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
+                          + 'search={0}&fulltext=Search'
 r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
 r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
 r_tag = re.compile(r'<(?!!)[^>]+>')
 r_whitespace = re.compile(r'[\t\r\n ]+')
 r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
 )
 abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
            'syn', 'transl', 'sess', 'fl', 'Op'] \
    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
    + list('abcdefghijklmnopqrstuvwxyz')
 t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
 r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
 def unescape(s):
    """Decode the few HTML entities this module cares about.

    Handles ``&#160;`` (turned into a plain space), ``&gt;``, ``&lt;`` and
    ``&amp;``. ``&amp;`` is decoded last so that doubly escaped text such as
    ``&amp;#160;`` is unescaped by exactly one level instead of collapsing
    all the way down to a space.
    """
    for entity, char in (('&#160;', ' '), ('&gt;', '>'),
                         ('&lt;', '<'), ('&amp;', '&')):
        s = s.replace(entity, char)
    return s
 def text(html):
    """Return *html* as plain text.

    Tags matched by ``r_tag`` are removed, whitespace runs matched by
    ``r_whitespace`` collapse to one space, entities are decoded with
    ``unescape`` and the result is stripped.
    """
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    return unescape(single_spaced).strip()
 def vtluugwiki(term, last=False):
    """Search the VTLUUG wiki API for *term* and describe the first hit.

    Returns ``"<snippet> - <page url>"`` for the first search result, or
    ``None`` when the response holds no results. ``last`` is accepted but
    not used. Exceptions raised by ``web.get`` propagate to the caller.
    """
    # NOTE(review): this relies on wikiapi/wikiuri being %-style templates;
    # confirm against the module constants.
    # The term is interpolated into a query string, so percent-encode it.
    url = wikiapi % urllib.parse.quote(term, safe='')
    payload = web.get(url)
    data = json.loads(payload)
    # A response without the 'query'/'search' members counts as "no hits".
    hits = data.get('query', {}).get('search', [])
    if not hits:
        return None
    title = hits[0]['title'].replace(' ', '_')
    snippet = text(hits[0]['snippet'])
    return "%s - %s" % (snippet, wikiuri % title)
 def vtluug(phenny, input): 
    """Answer the ``.vtluug`` command with a VTLUUG wiki search result.

    The search term is the second group of ``input``. With no term a usage
    hint is sent. Otherwise the term is percent-decoded, its first character
    is upper-cased and spaces become underscores before the lookup. An
    ``IOError`` during the lookup is reported as a connection failure and a
    ``None`` result as "nothing found"; every reply goes through
    ``phenny.say``.
    """
    origterm = input.groups()[1]
    if not origterm: 
-        return phenny.say('Perhaps you meant ".vtluug Zen"?')
+        return phenny.say('Perhaps you meant ".vtluug VT-Wireless"?')
    # Self-assignment: has no effect.
    origterm = origterm
    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')
-    try: result = vtluugwiki(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
    try:
        result = w.search(term)
    except IOError: 
-        error = "Can't connect to vtluug.org (%s)" % (wikiuri % term)
+        error = "Can't connect to vtluug.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)
    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in the VTLUUG Wiki for "%s".' % origterm)
+    else:
        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(origterm))
 vtluug.commands = ['vtluug']
 vtluug.priority = 'high'
--- a/modules/wikipedia.py
+++ b/modules/wikipedia.py
@@ -8,73 +8,34 @@ http://inamidst.com/phenny/
 """
 import re, urllib.request, urllib.parse, urllib.error, gzip, io
-import web
+import wiki
 import json
-wikiapi = 'http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
+wikiapi = 'http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'http://en.wikipedia.org/wiki/%s'
+wikiuri = 'http://en.wikipedia.org/wiki/{0}'
 wikisearch = 'http://en.wikipedia.org/wiki/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
+                          + 'search={0}&fulltext=Search'
 r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
 r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
 r_tag = re.compile(r'<(?!!)[^>]+>')
 r_whitespace = re.compile(r'[\t\r\n ]+')
 r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
 )
 abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
         'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
   + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
   + list('abcdefghijklmnopqrstuvwxyz')
 t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
 r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
 def unescape(s):
    """Decode the few HTML entities this module cares about.

    Handles ``&#160;`` (turned into a plain space), ``&gt;``, ``&lt;`` and
    ``&amp;``. ``&amp;`` is decoded last so that doubly escaped text such as
    ``&amp;#160;`` is unescaped by exactly one level instead of collapsing
    all the way down to a space.
    """
    for entity, char in (('&#160;', ' '), ('&gt;', '>'),
                         ('&lt;', '<'), ('&amp;', '&')):
        s = s.replace(entity, char)
    return s
 def text(html):
    """Return *html* as plain text.

    Tags matched by ``r_tag`` are removed, whitespace runs matched by
    ``r_whitespace`` collapse to one space, entities are decoded with
    ``unescape`` and the result is stripped.
    """
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    return unescape(single_spaced).strip()
 def wikipedia(term, last=False):
    """Search the Wikipedia API for *term* and describe the first hit.

    Returns ``"<snippet> - <page url>"`` for the first search result, or
    ``None`` when the response holds no results. ``last`` is accepted but
    not used. Exceptions raised by ``web.get`` propagate to the caller.
    """
    # NOTE(review): this relies on wikiapi/wikiuri being %-style templates;
    # confirm against the module constants.
    # The term is interpolated into a query string, so percent-encode it.
    url = wikiapi % urllib.parse.quote(term, safe='')
    payload = web.get(url)
    data = json.loads(payload)
    # A response without the 'query'/'search' members counts as "no hits".
    hits = data.get('query', {}).get('search', [])
    if not hits:
        return None
    title = hits[0]['title'].replace(' ', '_')
    snippet = text(hits[0]['snippet'])
    return "%s - %s" % (snippet, wikiuri % title)
 def wik(phenny, input): 
    """Answer the ``.wik`` command with a Wikipedia search result.

    The search term is the second group of ``input``. With no term a usage
    hint is sent. Otherwise the term is percent-decoded, its first character
    is upper-cased and spaces become underscores before the lookup. An
    ``IOError`` during the lookup is reported as a connection failure and a
    ``None`` result as "nothing found"; every reply goes through
    ``phenny.say``.
    """
    origterm = input.groups()[1]
    if not origterm: 
        return phenny.say('Perhaps you meant ".wik Zen"?')
    # Self-assignment: has no effect.
    origterm = origterm
    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')
-    try: result = wikipedia(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
    try:
        result = w.search(term)
    except IOError: 
-        error = "Can't connect to en.wikipedia.org (%s)" % (wikiuri % term)
+        error = "Can't connect to en.wikipedia.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)
    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in Wikipedia for "%s".' % origterm)
+    else:
        phenny.say('Can\'t find anything in Wikipedia for "{0}".'.format(origterm))
 wik.commands = ['wik']
 wik.priority = 'high'
--- a/wiki.py
+++ b/wiki.py
@@ -0,0 +1,54 @@
 import json
 import re
 import web
 r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
 r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
 r_tag = re.compile(r'<(?!!)[^>]+>')
 r_whitespace = re.compile(r'[\t\r\n ]+')
 r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
 )
 abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
         'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
   + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
   + list('abcdefghijklmnopqrstuvwxyz')
 t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
 r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
 class Wiki(object):
    """Small client for a wiki search API that answers in JSON.

    ``api`` and ``url`` are ``str.format`` templates: the ``{0}`` slot of
    ``api`` receives the search term and the ``{0}`` slot of ``url`` the
    title of the page that was found. ``searchurl`` is stored on the
    instance but not used by any method of this class.
    """

    def __init__(self, api, url, searchurl=""):
        self.api = api
        self.url = url
        self.searchurl = searchurl

    @staticmethod
    def unescape(s):
        """Decode ``&#160;`` (to a plain space), ``&gt;``, ``&lt;`` and ``&amp;``.

        ``&amp;`` is decoded last so that doubly escaped text such as
        ``&amp;#160;`` is unescaped by exactly one level.
        """
        for entity, char in (('&#160;', ' '), ('&gt;', '>'),
                             ('&lt;', '<'), ('&amp;', '&')):
            s = s.replace(entity, char)
        return s

    @staticmethod
    def text(html):
        """Return *html* with tags removed, whitespace collapsed and entities decoded."""
        html = r_tag.sub('', html)
        html = r_whitespace.sub(' ', html)
        return Wiki.unescape(html).strip()

    def search(self, term, last=False):
        """Look *term* up through the API and describe the first hit.

        Returns ``"<snippet> - <page url>"`` for the first search result, or
        ``None`` when the response holds no results. ``last`` is accepted
        but not used. Exceptions raised by ``web.get`` propagate to the
        caller.
        """
        # Imported here so the module's import block stays as it is.
        from urllib.parse import quote

        # The term is interpolated into a query string, so percent-encode
        # it; otherwise characters such as '&' or '#' would cut it short.
        url = self.api.format(quote(term, safe=''))
        payload = web.get(url)
        data = json.loads(payload)
        # A response without the 'query'/'search' members counts as "no hits".
        hits = data.get('query', {}).get('search', [])
        if not hits:
            return None
        title = hits[0]['title'].replace(' ', '_')
        snippet = self.text(hits[0]['snippet'])
        return "{0} - {1}".format(snippet, self.url.format(title))