Refactor Wikipedia modules

Commit e91f3bd16b (parent 2153d27b1b), branch master
Robin Richtsfeld, 2018-03-16 14:27:18 +01:00
4 changed files with 173 additions and 76 deletions

ArchWiki module (.awik)

@@ -10,36 +10,33 @@ modified from Wikipedia module
 author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

-import re
-import web
 import wiki

-wikiapi = 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://wiki.archlinux.org/index.php/{0}'
-wikisearch = 'https://wiki.archlinux.org/index.php/Special:Search?' \
-    + 'search={0}&fulltext=Search'
+endpoints = {
+    'api': 'https://wiki.archlinux.org/api.php?action=query&list=search&srsearch={0}&limit=1&format=json',
+    'url': 'https://wiki.archlinux.org/index.php/{0}',
+    'search': 'https://wiki.archlinux.org/index.php/Special:Search?search={0}&fulltext=Search',
+}

 def awik(phenny, input):
-    origterm = input.groups()[1]
+    """.awik <term> - Look up something on the ArchWiki."""
+
+    origterm = input.group(1)
     if not origterm:
         return phenny.say('Perhaps you meant ".awik dwm"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to wiki.archlinux.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
+    if not match:
+        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(term))
+        return

-    if result is not None:
-        phenny.say(result)
-    else:
-        phenny.say('Can\'t find anything in the ArchWiki for "{0}".'.format(origterm))
+    snippet, url = wiki.extract_snippet(match, section)
+    phenny.say('"{0}" - {1}'.format(snippet, url))

 awik.commands = ['awik']
 awik.priority = 'high'

VTLUUG wiki module (.vtluug)

@@ -10,14 +10,13 @@ modified from Wikipedia module
 author: mutantmonkey <mutantmonkey@mutantmonkey.in>
 """

-import re
-import web
 import wiki

-wikiapi = 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://vtluug.org/wiki/{0}'
-wikisearch = 'https://vtluug.org/wiki/Special:Search?' \
-    + 'search={0}&fulltext=Search'
+endpoints = {
+    'api': 'https://vtluug.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json',
+    'url': 'https://vtluug.org/wiki/{0}',
+    'search': 'https://vtluug.org/wiki/Special:Search?search={0}&fulltext=Search',
+}

 def vtluug(phenny, input):
     """.vtluug <term> - Look up something on the VTLUUG wiki."""
@@ -26,22 +25,19 @@ def vtluug(phenny, input):
     if not origterm:
         return phenny.say('Perhaps you meant ".vtluug VT-Wireless"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to vtluug.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
+    if not match:
+        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(term))
+        return

-    if result is not None:
-        phenny.say(result)
-    else:
-        phenny.say('Can\'t find anything in the VTLUUG Wiki for "{0}".'.format(origterm))
+    snippet, url = wiki.extract_snippet(match, section)
+    phenny.say('"{0}" - {1}'.format(snippet, url))

 vtluug.commands = ['vtluug']
 vtluug.priority = 'high'

Wikipedia module (.wik)

@@ -7,14 +7,13 @@ Licensed under the Eiffel Forum License 2.
 http://inamidst.com/phenny/
 """

-import re
-import web
 import wiki

-wikiapi = 'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={0}&limit=1&prop=snippet&format=json'
-wikiuri = 'https://en.wikipedia.org/wiki/{0}'
-wikisearch = 'https://en.wikipedia.org/wiki/Special:Search?' \
-    + 'search={0}&fulltext=Search'
+endpoints = {
+    'api': 'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch={0}&prop=snippet&limit=1',
+    'url': 'https://en.wikipedia.org/wiki/{0}',
+    'search': 'https://en.wikipedia.org/wiki/Special:Search?search={0}&fulltext=Search',
+}

 def wik(phenny, input):
     """.wik <term> - Look up something on Wikipedia."""
@@ -23,22 +22,19 @@ def wik(phenny, input):
     if not origterm:
         return phenny.say('Perhaps you meant ".wik Zen"?')

-    term = web.unquote(origterm)
-    term = term[0].upper() + term[1:]
-    term = term.replace(' ', '_')
+    origterm = origterm.strip()
+    term, section = wiki.parse_term(origterm)

-    w = wiki.Wiki(wikiapi, wikiuri, wikisearch)
+    w = wiki.Wiki(endpoints)
+    match = w.search(term)

-    try:
-        result = w.search(term)
-    except web.ConnectionError:
-        error = "Can't connect to en.wikipedia.org ({0})".format(wikiuri.format(term))
-        return phenny.say(error)
-
-    if result is not None:
-        phenny.say(result)
-    else:
+    if not match:
         phenny.say('Can\'t find anything in Wikipedia for "{0}".'.format(origterm))
+        return
+
+    snippet, url = wiki.extract_snippet(match, section)
+    phenny.say('"{0}" - {1}'.format(snippet, url))

 wik.commands = ['wik']
 wik.priority = 'high'
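
All three module hunks above have the same shape: the per-module URL globals become a single endpoints dict, and the lookup and error handling move into wiki.py. Wiring up another MediaWiki instance would only need a dict like the following (a hypothetical example; wiki.example.org is a placeholder and is not part of this commit):

endpoints = {
    'api': 'https://wiki.example.org/api.php?action=query&list=search&srsearch={0}&limit=1&format=json',
    'url': 'https://wiki.example.org/index.php/{0}',
    'search': 'https://wiki.example.org/index.php/Special:Search?search={0}&fulltext=Search',
}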

wiki.py

@@ -1,5 +1,8 @@
 import json
+import lxml.html
 import re
+from requests.exceptions import HTTPError
+from urllib.parse import quote, unquote
 import web
@@ -16,15 +19,104 @@ abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs',
          'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
     + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
     + list('abcdefghijklmnopqrstuvwxyz')

-t_sentence = r'^.{5,}?(?<!\b%s)(?:\.(?=[\[ ][A-Z0-9]|\Z)|\Z)'
-r_sentence = re.compile(t_sentence % r')(?<!\b'.join(abbrs))
+no_abbr = ''.join('(?<! ' + abbr + ')' for abbr in abbrs)
+breaks = re.compile('({})+'.format('|'.join([
+    no_abbr + '[.!?](?:[ \n]|\[[0-9]+\]|$)',
+    '', '', '', '', '',
+])))
+
+
+def format_term(term):
+    term = term.replace(' ', '_')
+    term = term[0].upper() + term[1:]
+    return term
+
+
+def deformat_term(term):
+    term = term.replace('_', ' ')
+    return term
+
+
+def format_section(section):
+    section = section.replace(' ', '_')
+    section = quote(section)
+    section = section.replace('%', '.')
+    section = section.replace(".3A", ":")
+    return section
+
+
+def parse_term(origterm):
+    if "#" in origterm:
+        term, section = origterm.split("#")[:2]
+        term, section = term.strip(), section.strip()
+    else:
+        term = origterm.strip()
+        section = None
+
+    return (term, section)
+
+
+def good_content(text, content):
+    if text.tag not in ['p', 'ul', 'ol']:
+        return False
+
+    if not content.strip():
+        return False
+
+    if not breaks.search(content):
+        return False
+
+    if text.find(".//span[@id='coordinates']") is not None:
+        return False
+
+    return True
+
+
+def search_content(text):
+    if text is None:
+        return None
+
+    content = text.text_content()
+    while not good_content(text, content):
+        text = text.getnext()
+        if text is None:
+            return None
+        content = text.text_content()
+
+    return content
+
+
+def extract_snippet(match, origsection=None):
+    html, url = match
+    page = lxml.html.fromstring(html)
+    article = page.get_element_by_id('mw-content-text')
+
+    if origsection:
+        section = format_section(origsection)
+        text = article.find(".//span[@id='{0}']".format(section))
+        url += "#" + unquote(section)
+
+        if text is None:
+            return ("No '{0}' section found.".format(origsection), url)
+
+        text = text.getparent().getnext()
+        content = search_content(text)
+
+        if text is None:
+            return ("No section text found.", url)
+    else:
+        text = article.find('./p')
+        if text is None:
+            text = article.find('./div/p')
+
+        content = search_content(text)
+
+        if text is None:
+            return ("No introduction text found.", url)
+
+    sentences = [x.strip() for x in breaks.split(content)]
+    return (sentences[0], url)
+

 class Wiki(object):
-    def __init__(self, api, url, searchurl=""):
-        self.api = api
-        self.url = url
-        self.searchurl = searchurl
+    def __init__(self, endpoints):
+        self.endpoints = endpoints

     @staticmethod
     def unescape(s):
@@ -41,18 +133,34 @@ class Wiki(object):
         html = r_whitespace.sub(' ', html)
         return Wiki.unescape(html).strip()

-    def search(self, term, last=False):
-        url = self.api.format(term)
-        bytes = web.get(url)
+    def search(self, term):
         try:
-            result = json.loads(bytes)
-            result = result['query']['search']
-            if len(result) <= 0:
-                return None
+            exactterm = format_term(term)
+            exactterm = quote(exactterm)
+            exacturl = self.endpoints['url'].format(exactterm)
+            html = web.get(exacturl)
+            return (html, exacturl)
+        except HTTPError:
+            pass
+
+        term = deformat_term(term)
+        term = quote(term)
+        apiurl = self.endpoints['api'].format(term)
+
+        try:
+            result = json.loads(web.get(apiurl))
         except ValueError:
             return None
-        term = result[0]['title']
-        term = term.replace(' ', '_')
-        snippet = self.text(result[0]['snippet'])
-        return "{0} - {1}".format(snippet, self.url.format(term))
+
+        result = result['query']['search']
+
+        if not result:
+            return None
+
+        term = result[0]['title']
+        term = format_term(term)
+        term = quote(term)
+
+        url = self.endpoints['url'].format(term)
+        html = web.get(url)
+
+        return (html, url)
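
For reference, a rough sketch of how the new wiki.py pieces fit together, mirroring what the module diffs above do (illustrative only; the page title is just an example and phenny's say() is replaced with print()):

import wiki

endpoints = {
    'api': 'https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch={0}&prop=snippet&limit=1',
    'url': 'https://en.wikipedia.org/wiki/{0}',
    'search': 'https://en.wikipedia.org/wiki/Special:Search?search={0}&fulltext=Search',
}

# parse_term() splits an optional "#Section" suffix off the query.
term, section = wiki.parse_term('Python (programming language)#History')

w = wiki.Wiki(endpoints)

# search() first fetches the article URL directly and falls back to the
# MediaWiki search API on HTTPError; it returns (html, url) or None.
match = w.search(term)

if match:
    # extract_snippet() returns (first sentence, url), anchored to the
    # requested section when one was given.
    snippet, url = wiki.extract_snippet(match, section)
    print('"{0}" - {1}'.format(snippet, url))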