Refactor Wikipedia modules

2018-03-16 14:27:18 +01:00
parent 2153d27b1b
commit e91f3bd16b
4 changed files with 173 additions and 76 deletions
--- a/modules/archwiki.py
+++ b/modules/archwiki.py
@@ -10,36 +10,33 @@ modified from Wikipedia module
 author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

-import re
-import web
 import wiki

-wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://wiki.archlinux.org/index.php/{0}'
-wikisearch = 'https://wiki.archlinux.org/index.php/Special:Search?' \
-                          + 'search={0}&fulltext=Search'
+# URL templates handed to wiki.Wiki; '{0}' is filled in with the
+# percent-encoded term or page title.
+#   api    - MediaWiki list=search query (srlimit caps it at one hit)
+#   url    - article URL
+#   search - full-text search page (not read by the code in this change)
+endpoints = {
+    'api': 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&srlimit=1&format=json',
+    'url': 'https://wiki.archlinux.org/index.php/{0}',
+    'search': 'https://wiki.archlinux.org/index.php/Special:Search?search={0}&fulltext=Search',
+}

 def awik(phenny, input): 
-    origterm = input.groups()[1]
+    """.awik <term> - Look up something on the ArchWiki."""
+
+    # group(2) is the same capture the replaced input.groups()[1] read
+    # (groups() is a zero-indexed tuple of groups 1..n); group(1) would be
+    # a different, earlier capture rather than the requested term.
+    origterm = input.group(2)
    if not origterm:
        return phenny.say('Perhaps you meant ".awik dwm"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    # Split an optional "#section" suffix off the requested title.
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to wiki.archlinux.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
+    if not match:
+        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(term))
+        return

-    if result is not None: 
-        phenny.say(result)
-    else:
-        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(origterm))
+    # match is the (html, url) pair returned by Wiki.search.
+    snippet, url = wiki.extract_snippet(match, section)
+
+    phenny.say('"{0}" - {1}'.format(snippet, url))

 awik.commands = ['awik']
 awik.priority = 'high'
--- a/modules/vtluugwiki.py
+++ b/modules/vtluugwiki.py
@@ -10,14 +10,13 @@ modified from Wikipedia module
 author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

-import re
-import web
 import wiki

-wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://vtluug.org/wiki/{0}'
-wikisearch = 'https://vtluug.org/wiki/Special:Search?' \
-                          + 'search={0}&fulltext=Search'
+# URL templates handed to wiki.Wiki; '{0}' is filled in with the
+# percent-encoded term or page title.
+#   api    - MediaWiki list=search query (sr-prefixed options: one hit,
+#            snippet property only)
+#   url    - article URL
+#   search - full-text search page (not read by the code in this change)
+endpoints = {
+    'api': 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&srlimit=1&srprop=snippet&format=json',
+    'url': 'https://vtluug.org/wiki/{0}',
+    'search': 'https://vtluug.org/wiki/Special:Search?search={0}&fulltext=Search',
+}

 def vtluug(phenny, input): 
    """.vtluug <term> - Look up something on the VTLUUG wiki."""
@@ -26,22 +25,19 @@ def vtluug(phenny, input):
    if not origterm: 
        return phenny.say('Perhaps you meant ".vtluug VT-Wireless"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to vtluug.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
+    if not match:
+        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(term))
+        return
+
+    snippet, url = wiki.extract_snippet(match, section)
+
+    phenny.say('"{0}" - {1}'.format(snippet, url))

-    if result is not None: 
-        phenny.say(result)
-    else:
-        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(origterm))
 vtluug.commands = ['vtluug']
 vtluug.priority = 'high'

--- a/modules/wikipedia.py
+++ b/modules/wikipedia.py
@@ -7,14 +7,13 @@ Licensed under the Eiffel Forum License 2.
 http://inamidst.com/phenny/
 """

-import re
-import web
 import wiki

-wikiapi = 'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://en.wikipedia.org/wiki/{0}'
-wikisearch = 'https://en.wikipedia.org/wiki/Special:Search?' \
-                          + 'search={0}&fulltext=Search'
+# URL templates handed to wiki.Wiki; '{0}' is filled in with the
+# percent-encoded term or page title.
+#   api    - MediaWiki list=search query (sr-prefixed options: one hit,
+#            snippet property only)
+#   url    - article URL
+#   search - full-text search page (not read by the code in this change)
+endpoints = {
+    'api': 'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch={0}&srprop=snippet&srlimit=1',
+    'url': 'https://en.wikipedia.org/wiki/{0}',
+    'search': 'https://en.wikipedia.org/wiki/Special:Search?search={0}&fulltext=Search',
+}

 def wik(phenny, input): 
    """.wik <term> - Look up something on Wikipedia."""
@@ -23,22 +22,19 @@ def wik(phenny, input):
    if not origterm: 
        return phenny.say('Perhaps you meant ".wik Zen"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    origterm = origterm.strip()
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to en.wikipedia.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
-
-    if result is not None: 
-        phenny.say(result)
-    else:
+    if not match:
        phenny.say('Can\'t find anything in Wikipedia for "{0}".'.format(origterm))
+        return
+
+    snippet, url = wiki.extract_snippet(match, section)
+
+    phenny.say('"{0}" - {1}'.format(snippet, url))

 wik.commands = ['wik']
 wik.priority = 'high'
--- a/wiki.py
+++ b/wiki.py
@@ -1,5 +1,8 @@
 import json
+import lxml.html
 import re
+from requests.exceptions import HTTPError
+from urllib.parse import quote, unquote
 import web


@@ -16,15 +19,104 @@ abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs',
         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
   + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
   + list('abcdefghijklmnopqrstuvwxyz')
-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
+# Chain of negative lookbehinds: the punctuation matched next must not be
+# directly preceded by a space plus one of the listed abbreviations.
+no_abbr = ''.join('(?<! ' + abbr + ')' for abbr in abbrs)
+# One or more consecutive sentence terminators.  The first alternative is
+# '.', '!' or '?' followed by a space, a newline, a "[N]" footnote marker or
+# end of text; the remaining alternatives are full-width / CJK marks that
+# end a sentence on their own.
+# The first alternative is a raw string so that "\[" and "\n" reach the
+# regex engine as regex escapes instead of being (invalid) string escapes.
+breaks = re.compile('({})+'.format('|'.join([
+    no_abbr + r'[.!?](?:[ \n]|\[[0-9]+\]|$)',
+    '。', '｡', '．', '！', '？',
+])))

+def format_term(term):
+    """Turn a search term into page-title form.
+
+    Spaces become underscores and the first character is upper-cased; the
+    rest of the term is left as given.  An empty term yields an empty
+    string rather than raising IndexError.
+    """
+    term = term.replace(' ', '_')
+    # Slice instead of indexing so the empty string is handled.
+    return term[:1].upper() + term[1:]
+
+def deformat_term(term):
+    """Return *term* with every underscore replaced by a space."""
+    return term.replace('_', ' ')
+
+def format_section(section):
+    """Encode a section heading as the anchor id looked up in the page.
+
+    Spaces become underscores, the result is percent-encoded, each '%' is
+    swapped for '.', and '.3A' (an encoded ':') is turned back into ':'.
+    """
+    encoded = quote(section.replace(' ', '_'))
+    return encoded.replace('%', '.').replace(".3A", ":")
+
+def parse_term(origterm):
+    """Split "Title#Section" into a (term, section) pair.
+
+    Both parts are stripped of surrounding whitespace.  Only the first two
+    '#'-separated pieces are used; without a '#' the section is None.
+    """
+    if "#" not in origterm:
+        return (origterm.strip(), None)
+
+    pieces = origterm.split("#")
+    return (pieces[0].strip(), pieces[1].strip())
+
+def good_content(text, content):
+    """Tell whether element *text* with extracted string *content* is usable.
+
+    Usable means: the element is a <p>, <ul> or <ol>; its content is not
+    blank; the content contains at least one sentence break; and the
+    element holds no <span id="coordinates"> descendant.
+    """
+    return (
+        text.tag in ['p', 'ul', 'ol']
+        and bool(content.strip())
+        and breaks.search(content) is not None
+        and text.find(".//span[@id='coordinates']") is None
+    )
+
+def search_content(text):
+    """Return the text of the first usable element at or after *text*.
+
+    Walks from *text* through its following siblings and returns the
+    text_content() of the first one good_content() accepts, or None when
+    *text* is None or the siblings run out.
+    """
+    node = text
+    while node is not None:
+        content = node.text_content()
+        if good_content(node, content):
+            return content
+        node = node.getnext()
+
+    return None
+
+def extract_snippet(match, origsection=None):
+    """Pull the first sentence out of a fetched wiki page.
+
+    match is the (html, url) pair produced by Wiki.search.  With
+    origsection set, the sentence is taken from the content following that
+    section's heading span and the anchor is appended to the URL; otherwise
+    it is taken from the first paragraph under the 'mw-content-text'
+    element.
+
+    Returns (snippet, url).  When nothing usable is found the snippet is an
+    explanatory message instead of page text.
+    """
+    html, url = match
+    page = lxml.html.fromstring(html)
+    # NOTE(review): no default is passed, so a page without this id
+    # presumably raises here -- confirm against callers.
+    article = page.get_element_by_id('mw-content-text')
+
+    if origsection:
+        section = format_section(origsection)
+        text = article.find(".//span[@id='{0}']".format(section))
+        url += "#" + unquote(section)
+
+        if text is None:
+            return ("No '{0}' section found.".format(origsection), url)
+
+        # The span sits inside the heading; content starts at the heading's
+        # next sibling.
+        content = search_content(text.getparent().getnext())
+
+        # Check the search result, not the start element: the heading may
+        # have siblings yet none of them usable.
+        if content is None:
+            return ("No section text found.", url)
+    else:
+        text = article.find('./p')
+
+        if text is None:
+            text = article.find('./div/p')
+
+        content = search_content(text)
+
+        # search_content returns None both for a missing start element and
+        # for a page with no usable paragraph.
+        if content is None:
+            return ("No introduction text found.", url)
+
+    sentences = [x.strip() for x in breaks.split(content)]
+    return (sentences[0], url)

 class Wiki(object):
-    def __init__(self, api, url, searchurl=""):
-        self.api = api
-        self.url = url
-        self.searchurl = searchurl
+    def __init__(self, endpoints):
+        """Store the mapping of URL templates; search() reads 'url' and 'api'."""
+        self.endpoints = endpoints

    @staticmethod
    def unescape(s): 
@@ -41,18 +133,34 @@ class Wiki(object):
        html = r_whitespace.sub(' ', html)
        return Wiki.unescape(html).strip()

-    def search(self, term, last=False):
-        url = self.api.format(term)
-        bytes = web.get(url)
+    def search(self, term):
+        """Fetch the wiki page that best matches *term*.
+
+        The page at the exact title is requested first.  If that request
+        raises HTTPError, the search API is queried and the page of its
+        first hit is fetched instead.
+
+        Returns an (html, url) pair, or None when the API reply is not
+        valid JSON, carries no 'query'/'search' data, or lists no hits.
+        The fetch of the hit's page is not guarded, so whatever web.get
+        raises there propagates to the caller.
+        """
        try:
-            result = json.loads(bytes)
-            result = result['query']['search']
-            if len(result) <= 0:
-                return None
+            exactterm = format_term(term)
+            exactterm = quote(exactterm)
+            exacturl = self.endpoints['url'].format(exactterm)
+            html = web.get(exacturl)
+            return (html, exacturl)
+        except HTTPError:
+            pass
+
+        term = deformat_term(term)
+        term = quote(term)
+        apiurl = self.endpoints['api'].format(term)
+
+        try:
+            result = json.loads(web.get(apiurl))
        except ValueError:
            return None
-        term = result[0]['title']
-        term = term.replace(' ', '_')
-        snippet = self.text(result[0]['snippet'])
-        return "{0} - {1}".format(snippet, self.url.format(term))

+        # An API error payload has no 'query' key; report "no result"
+        # instead of letting KeyError/TypeError escape.
+        try:
+            result = result['query']['search']
+        except (KeyError, TypeError):
+            return None
+
+        if not result:
+            return None
+
+        term = result[0]['title']
+        term = format_term(term)
+        term = quote(term)
+
+        url = self.endpoints['url'].format(term)
+        html = web.get(url)
+        return (html, url)