Refactor MediaWiki modules into a unified library

2012-06-13 21:58:31 -07:00
parent 63c6adb316
commit ec32741826
5 changed files with 88 additions and 317 deletions
@@ -11,74 +11,34 @@ author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

 import re, urllib.request, urllib.parse, urllib.error
-import web
-import json
+import wiki

-wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
-wikiuri = 'https://wiki.archlinux.org/index.php/%s'
+wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
+wikiuri = 'https://wiki.archlinux.org/index.php/{0}'
 wikisearch = 'https://wiki.archlinux.org/index.php/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
-
-r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
-r_content = re.compile(r'(?ims)</p>\n</div>.*?<!-- end content -->')
-r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
-r_tag = re.compile(r'<(?!!)[^>]+>')
-r_whitespace = re.compile(r'[\t\r\n ]+')
-r_redirect = re.compile(
-    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
-)
-
-abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
-            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
-            'syn', 'transl', 'sess', 'fl', 'Op'] \
-    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
-    + list('abcdefghijklmnopqrstuvwxyz')
-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
-
-def unescape(s): 
-    s = s.replace('&gt;', '>')
-    s = s.replace('&lt;', '<')
-    s = s.replace('&amp;', '&')
-    s = s.replace('&#160;', ' ')
-    return s
-
-def text(html): 
-    html = r_tag.sub('', html)
-    html = r_whitespace.sub(' ', html)
-    return unescape(html).strip()
-
-def archwiki(term, last=False): 
-    global wikiapi, wikiuri
-    url = wikiapi % term
-    bytes = web.get(url)
-    result = json.loads(bytes)
-    result = result['query']['search']
-    if len(result) <= 0:
-        return None
-    term = result[0]['title']
-    term = term.replace(' ', '_')
-    snippet = text(result[0]['snippet'])
-    return "%s - %s" % (snippet, wikiuri % term)
+                          + 'search={0}&fulltext=Search'

 def awik(phenny, input): 
    origterm = input.groups()[1]
    if not origterm: 
        return phenny.say('Perhaps you meant ".awik dwm"?')
-    origterm = origterm

    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')

-    try: result = archwiki(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+
+    try:
+        result = w.search(term)
    except IOError: 
-        error = "Can't connect to wiki.archlinux.org (%s)" % (wikiuri % term)
+        error = "Can't connect to wiki.archlinux.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)

    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in the ArchWiki for "%s".' % origterm)
+    else:
+        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(origterm))

 awik.commands = ['awik']
 awik.priority = 'high'
@@ -1,165 +0,0 @@
-#!/usr/bin/env python
-"""
-uncyclopedia.py - Phenny Uncyclopedia Module
-Copyright 2008-9, Sean B. Palmer, inamidst.com
-Licensed under the Eiffel Forum License 2.
-
-http://inamidst.com/phenny/
-
-modified from Wikipedia module
-author: mutantmonkey <mutantmonkey@mutantmonkey.in>
-"""
-
-import re, urllib.request, urllib.parse, urllib.error
-import web
-
-wikiuri = 'http://uncyclopedia.wikia.com/wiki/%s'
-wikisearch = 'http://uncyclopedia.wikia.com/wiki/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
-
-r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
-r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
-r_tag = re.compile(r'<(?!!)[^>]+>')
-r_whitespace = re.compile(r'[\t\r\n ]+')
-r_redirect = re.compile(
-    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
-)
-
-abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
-            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
-            'syn', 'transl', 'sess', 'fl', 'Op'] \
-    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
-    + list('abcdefghijklmnopqrstuvwxyz')
-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
-
-def unescape(s): 
-    s = s.replace('&gt;', '>')
-    s = s.replace('&lt;', '<')
-    s = s.replace('&amp;', '&')
-    s = s.replace('&#160;', ' ')
-    return s
-
-def text(html): 
-    html = r_tag.sub('', html)
-    html = r_whitespace.sub(' ', html)
-    return unescape(html).strip()
-
-def search(term): 
-    try: from . import search
-    except ImportError as e: 
-        print(e)
-        return term
-
-    if not isinstance(term, str): 
-        term = term.decode('utf-8')
-
-    term = term.replace('_', ' ')
-    try: uri = search.result('site:uncyclopedia.wikia.com %s' % term)
-    except IndexError: return term
-    if uri: 
-        return uri[len('http://uncyclopedia.wikia.com/wiki/'):]
-    else: return term
-
-def uncyclopedia(term, last=False): 
-    global wikiuri
-    if not '%' in term: 
-        if isinstance(term, str): 
-            t = term
-        else: t = term
-        q = urllib.parse.quote(t)
-        u = wikiuri % q
-        bytes = web.get(u)
-    else: bytes = web.get(wikiuri % term)
-    bytes = r_tr.sub('', bytes)
-
-    if not last: 
-        r = r_redirect.search(bytes[:4096])
-        if r: 
-            term = urllib.parse.unquote(r.group(1))
-            return uncyclopedia(term, last=True)
-
-    paragraphs = r_paragraph.findall(bytes)
-
-    if not paragraphs: 
-        if not last: 
-            term = search(term)
-            return uncyclopedia(term, last=True)
-        return None
-
-    # Pre-process
-    paragraphs = [para for para in paragraphs 
-                      if (para and 'technical limitations' not in para 
-                                  and 'window.showTocToggle' not in para 
-                                  and 'Deletion_policy' not in para 
-                                  and 'Template:AfD_footer' not in para 
-                                  and not (para.startswith('<p><i>') and 
-                                              para.endswith('</i></p>'))
-                                  and not 'disambiguation)"' in para) 
-                                  and not '(images and media)' in para
-                                  and not 'This article contains a' in para 
-                                  and not 'id="coordinates"' in para
-                                  and not 'class="thumb' in para
-                                  and not 'There is currently no text in this page.' in para]
-                                  # and not 'style="display:none"' in para]
-
-    for i, para in enumerate(paragraphs): 
-        para = para.replace('<sup>', '|')
-        para = para.replace('</sup>', '|')
-        paragraphs[i] = text(para).strip()
-
-    # Post-process
-    paragraphs = [para for para in paragraphs if 
-                      (para and not (para.endswith(':') and len(para) < 150))]
-
-    para = text(paragraphs[0])
-    m = r_sentence.match(para)
-
-    if not m: 
-        if not last: 
-            term = search(term)
-            return uncyclopedia(term, last=True)
-        return None
-    sentence = m.group(0)
-
-    maxlength = 275
-    if len(sentence) > maxlength: 
-        sentence = sentence[:maxlength]
-        words = sentence[:-5].split(' ')
-        words.pop()
-        sentence = ' '.join(words) + ' [...]'
-
-    if (('using the Article Wizard if you wish' in sentence)
-     or ('or add a request for it' in sentence)): 
-        if not last: 
-            term = search(term)
-            return uncyclopedia(term, last=True)
-        return None
-
-    sentence = '"' + sentence.replace('"', "'") + '"'
-    return sentence + ' - ' + (wikiuri % term)
-
-def uncyc(phenny, input): 
-    origterm = input.groups()[1]
-    if not origterm: 
-        return phenny.say('Perhaps you meant ".uncyc Zen"?')
-    origterm = origterm
-
-    term = urllib.parse.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
-
-    try: result = uncyclopedia(term)
-    except IOError: 
-        error = "Can't connect to uncyclopedia.wikia.com (%s)" % (wikiuri % term)
-        return phenny.say(error)
-
-    if result is not None: 
-        phenny.say(result)
-    else: phenny.say('Can\'t find anything in Uncyclopedia for "%s".' % origterm)
-
-uncyc.commands = ['uncyc']
-uncyc.priority = 'high'
-
-if __name__ == '__main__': 
-    print(__doc__.strip())
@@ -11,73 +11,34 @@ author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

 import re, urllib.request, urllib.parse, urllib.error
-import web
-import json
+import wiki

-wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
-wikiuri = 'https://vtluug.org/wiki/%s'
+wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
+wikiuri = 'https://vtluug.org/wiki/{0}'
 wikisearch = 'https://vtluug.org/wiki/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
-
-r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
-r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
-r_tag = re.compile(r'<(?!!)[^>]+>')
-r_whitespace = re.compile(r'[\t\r\n ]+')
-r_redirect = re.compile(
-    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
-)
-
-abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
-            'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
-            'syn', 'transl', 'sess', 'fl', 'Op'] \
-    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
-    + list('abcdefghijklmnopqrstuvwxyz')
-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
-
-def unescape(s): 
-    s = s.replace('&gt;', '>')
-    s = s.replace('&lt;', '<')
-    s = s.replace('&amp;', '&')
-    s = s.replace('&#160;', ' ')
-    return s
-
-def text(html): 
-    html = r_tag.sub('', html)
-    html = r_whitespace.sub(' ', html)
-    return unescape(html).strip()
-
-def vtluugwiki(term, last=False): 
-    global wikiapi, wikiuri
-    url = wikiapi % term
-    bytes = web.get(url)
-    result = json.loads(bytes)
-    result = result['query']['search']
-    if len(result) <= 0:
-        return None
-    term = result[0]['title']
-    term = term.replace(' ', '_')
-    snippet = text(result[0]['snippet'])
-    return "%s - %s" % (snippet, wikiuri % term)
+                          + 'search={0}&fulltext=Search'

 def vtluug(phenny, input): 
    origterm = input.groups()[1]
    if not origterm: 
-        return phenny.say('Perhaps you meant ".vtluug Zen"?')
-    origterm = origterm
+        return phenny.say('Perhaps you meant ".vtluug VT-Wireless"?')

    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')

-    try: result = vtluugwiki(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+
+    try:
+        result = w.search(term)
    except IOError: 
-        error = "Can't connect to vtluug.org (%s)" % (wikiuri % term)
+        error = "Can't connect to vtluug.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)

    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in the VTLUUG Wiki for "%s".' % origterm)
+    else:
+        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(origterm))

 vtluug.commands = ['vtluug']
 vtluug.priority = 'high'
@@ -8,73 +8,34 @@ http://inamidst.com/phenny/
 """

 import re, urllib.request, urllib.parse, urllib.error, gzip, io
-import web
-import json
+import wiki

-wikiapi = 'http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=%s&limit=1&prop=snippet&format=json'
-wikiuri = 'http://en.wikipedia.org/wiki/%s'
+wikiapi = 'http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
+wikiuri = 'http://en.wikipedia.org/wiki/{0}'
 wikisearch = 'http://en.wikipedia.org/wiki/Special:Search?' \
-                          + 'search=%s&fulltext=Search'
-
-r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
-r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
-r_tag = re.compile(r'<(?!!)[^>]+>')
-r_whitespace = re.compile(r'[\t\r\n ]+')
-r_redirect = re.compile(
-    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
-)
-
-abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
-         'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
-         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
-   + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
-   + list('abcdefghijklmnopqrstuvwxyz')
-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
-
-def unescape(s): 
-    s = s.replace('&gt;', '>')
-    s = s.replace('&lt;', '<')
-    s = s.replace('&amp;', '&')
-    s = s.replace('&#160;', ' ')
-    return s
-
-def text(html): 
-    html = r_tag.sub('', html)
-    html = r_whitespace.sub(' ', html)
-    return unescape(html).strip()
-
-def wikipedia(term, last=False): 
-    global wikiapi, wikiuri
-    url = wikiapi % term
-    bytes = web.get(url)
-    result = json.loads(bytes)
-    result = result['query']['search']
-    if len(result) <= 0:
-        return None
-    term = result[0]['title']
-    term = term.replace(' ', '_')
-    snippet = text(result[0]['snippet'])
-    return "%s - %s" % (snippet, wikiuri % term)
+                          + 'search={0}&fulltext=Search'

 def wik(phenny, input): 
    origterm = input.groups()[1]
    if not origterm: 
        return phenny.say('Perhaps you meant ".wik Zen"?')
-    origterm = origterm

    term = urllib.parse.unquote(origterm)
    term = term[0].upper() + term[1:]
    term = term.replace(' ', '_')

-    try: result = wikipedia(term)
+    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+
+    try:
+        result = w.search(term)
    except IOError: 
-        error = "Can't connect to en.wikipedia.org (%s)" % (wikiuri % term)
+        error = "Can't connect to en.wikipedia.org ({0})".format(wikiuri.format(term))
        return phenny.say(error)

    if result is not None: 
        phenny.say(result)
-    else: phenny.say('Can\'t find anything in Wikipedia for "%s".' % origterm)
+    else:
+        phenny.say('Can\'t find anything in Wikipedia for "{0}".'.format(origterm))

 wik.commands = ['wik']
 wik.priority = 'high'
@@ -0,0 +1,54 @@
+import json
+import re
+import web
+
+
+# Shared HTML-scraping patterns. Every pattern that carries flags uses the
+# inline group (?ims): case-insensitive, multiline, and '.' matches newlines.
+# NOTE(review): of these names, only r_tag and r_whitespace are referenced by
+# the Wiki class shown in this hunk; the others may be unused here.
+
+# A complete <tr>...</tr> table row (non-greedy).
+r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
+# A complete <p>...</p> or <li>...</li> element; the (?!n) lookahead keeps
+# tags that merely start with "<lin" (e.g. <link>) from matching as <li>.
+r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
+# Any single tag, except constructs that begin with "<!".
+r_tag = re.compile(r'<(?!!)[^>]+>')
+# A run of tabs, carriage returns, newlines and/or spaces.
+r_whitespace = re.compile(r'[\t\r\n ]+')
+# Captures the page name from the first /wiki/ link that follows a
+# "redirectText" class attribute.
+r_redirect = re.compile(
+    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
+)
+
+# Abbreviations and single letters; each one becomes a negative lookbehind in
+# r_sentence so that a period directly after it does not end the sentence.
+abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs', 
+         'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit', 
+         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
+   + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
+   + list('abcdefghijklmnopqrstuvwxyz')
+# Template for the first sentence: at least five characters from the start of
+# the string, ending at a period that is followed by a space or '[' and then
+# an uppercase letter or digit, or ending at the end of the string. The single
+# %s receives the abbreviations joined as chained (?<!\b...) lookbehinds.
+t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
+r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
+
+
+class Wiki(object):
+    """Minimal client for a MediaWiki-style JSON search endpoint.
+
+    api       -- str.format template for the search API URL; ``{0}`` receives
+                 the percent-encoded search term.
+    url       -- str.format template for a page URL; ``{0}`` receives the
+                 title of the first hit with spaces turned into underscores.
+    searchurl -- str.format template for the human-facing search page. It is
+                 only stored; nothing in this class reads it.
+    """
+
+    def __init__(self, api, url, searchurl=""):
+        self.api = api
+        self.url = url
+        self.searchurl = searchurl
+
+    @staticmethod
+    def unescape(s):
+        """Return s with &gt;, &lt;, &#160; and &amp; decoded exactly once."""
+        s = s.replace('&gt;', '>')
+        s = s.replace('&lt;', '<')
+        s = s.replace('&#160;', ' ')
+        # '&amp;' must be decoded last: decoding it earlier would let text
+        # such as '&amp;#160;' or '&amp;lt;' be unescaped a second time.
+        s = s.replace('&amp;', '&')
+        return s
+
+    @staticmethod
+    def text(html):
+        """Strip tags from html, collapse whitespace runs and decode entities."""
+        html = r_tag.sub('', html)
+        html = r_whitespace.sub(' ', html)
+        return Wiki.unescape(html).strip()
+
+    def search(self, term, last=False):
+        """Look up term and return "<snippet> - <page url>" for the first hit.
+
+        term is expected to be unescaped text; it is percent-encoded here
+        before being placed in the API URL, so characters such as '&', '#',
+        '%' and non-ASCII letters cannot corrupt the query string.
+
+        Returns None when there is no hit or when the response is not the
+        expected JSON shape. Exceptions raised by web.get itself (for example
+        connection failures) are not caught and propagate to the caller.
+
+        last is accepted for backward compatibility and is not used.
+        """
+        # Function-scope import keeps this block self-contained.
+        import urllib.parse
+
+        url = self.api.format(urllib.parse.quote(term, safe=''))
+        # NOTE(review): web.get is assumed to return the response body.
+        response = web.get(url)
+        try:
+            hits = json.loads(response)['query']['search']
+            if not hits:
+                return None
+            first = hits[0]
+            title = first['title'].replace(' ', '_')
+            snippet = self.text(first['snippet'])
+        except (ValueError, KeyError, TypeError):
+            # Non-JSON body, API error object, or a hit missing its fields:
+            # report "nothing found" instead of crashing the caller.
+            return None
+        return "{0} - {1}".format(snippet, self.url.format(title))
+