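"""Fetch MediaWiki articles and reduce them to one-sentence snippets."""
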
import json
import lxml.html
import re
from requests.exceptions import HTTPError
from urllib.parse import quote, unquote
import web

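# Patterns for stripping and scanning raw MediaWiki HTML.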
r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')
r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')
r_tag = re.compile(r'<(?!!)[^>]+>')
r_whitespace = re.compile(r'[\t\r\n ]+')
r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*<a\s*href=./wiki/([^"/]+)'
)

# Abbreviations whose trailing period does not end a sentence, plus
# bare initials; each becomes a negative lookbehind below.
abbrs = ['etc', 'ca', 'cf', 'Co', 'Ltd', 'Inc', 'Mt', 'Mr', 'Mrs',
         'Dr', 'Ms', 'Rev', 'Fr', 'St', 'Sgt', 'pron', 'approx', 'lit',
         'syn', 'transl', 'sess', 'fl', 'Op', 'Dec', 'Brig', 'Gen'] \
    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') \
    + list('abcdefghijklmnopqrstuvwxyz')
no_abbr = ''.join('(?<! ' + abbr + ')' for abbr in abbrs)
# A sentence break is . ! or ? that is not preceded by an abbreviation
# and is followed by whitespace, a [n] footnote marker, or the end of
# the text; the ideographic and fullwidth terminators always break.
breaks = re.compile('({})+'.format('|'.join([
    no_abbr + r'[.!?](?:[ \n]|\[[0-9]+\]|$)',
    '。', '｡', '．', '！', '？',
])))
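# For example, breaks.split('He saw Dr. Smith. Then he left.')[0]
# yields 'He saw Dr. Smith': the lookbehind guard keeps the period
# after 'Dr' from counting as a sentence break.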


def format_term(term):
    """Convert a search term to MediaWiki page-title form: spaces
    become underscores and the first letter is capitalized (assumes a
    non-empty term)."""
    term = term.replace(' ', '_')
    term = term[0].upper() + term[1:]
    return term


def deformat_term(term):
    """Undo format_term: underscores back to spaces."""
    term = term.replace('_', ' ')
    return term


def format_section(section):
    """Encode a section name as a MediaWiki anchor id: percent-encode,
    swap '%' for '.', and keep ':' literal, so 'Sales & marketing'
    becomes 'Sales_.26_marketing'."""
    section = section.replace(' ', '_')
    section = quote(section)
    section = section.replace('%', '.')
    section = section.replace(".3A", ":")
    return section


def parse_term(origterm):
    """Split 'Title#Section' into (term, section); section is None
    when no '#' is present."""
    if "#" in origterm:
        term, section = origterm.split("#")[:2]
        term, section = term.strip(), section.strip()
    else:
        term = origterm.strip()
        section = None

    return (term, section)


def good_content(text, content):
    """Heuristics for whether an element holds real article prose."""
    # Only paragraphs and lists can hold the snippet.
    if text.tag not in ['p', 'ul', 'ol']:
        return False

    # Skip empty elements.
    if not content.strip():
        return False

    # Skip fragments that contain no complete sentence.
    if not breaks.search(content):
        return False

    # Skip the geocoordinates block that tops many articles.
    if text.find(".//span[@id='coordinates']") is not None:
        return False

    return True


def search_content(text):
    """Walk forward through siblings until good_content accepts one;
    return its text, or None if we run out of elements."""
    if text is None:
        return None

    content = text.text_content()

    while not good_content(text, content):
        text = text.getnext()

        if text is None:
            return None

        content = text.text_content()

    return content


def extract_snippet(match, origsection=None):
    """Reduce a (html, url) pair to (first sentence, url)."""
    html, url = match
    page = lxml.html.fromstring(html)
    article = page.get_element_by_id('mw-content-text')

    if origsection:
        section = format_section(origsection)
        text = article.find(".//span[@id='{0}']".format(section))
        url += "#" + unquote(section)

        if text is None:
            return ("No '{0}' section found.".format(origsection), url)

        # The heading span sits inside an <h2>/<h3>; the section body
        # starts at the heading's next sibling.
        text = text.getparent().getnext()
        content = search_content(text)

        if content is None:
            return ("No section text found.", url)
    else:
        text = article.find('./p')

        if text is None:
            text = article.find('./div/p')

        content = search_content(text)

        if content is None:
            return ("No introduction text found.", url)

    sentences = [x.strip() for x in breaks.split(content)]
    return (sentences[0], url)


class Wiki(object):
    """Fetch pages from a MediaWiki site, trying the exact title first
    and falling back to the site's search API."""

    def __init__(self, endpoints):
        # endpoints maps 'url' and 'api' to format strings taking the
        # quoted page title or search term.
        self.endpoints = endpoints

    @staticmethod
    def unescape(s):
        # Decode the handful of HTML entities left after tag stripping.
        s = s.replace('&gt;', '>')
        s = s.replace('&lt;', '<')
        s = s.replace('&amp;', '&')
        s = s.replace('&#160;', ' ')
        s = s.replace('&quot;', '"')
        return s

    @staticmethod
    def text(html):
        """Strip tags, collapse whitespace, and decode entities."""
        html = r_tag.sub('', html)
        html = r_whitespace.sub(' ', html)
        return Wiki.unescape(html).strip()

    def search(self, term):
        """Return (html, url) for the best page for term, or None."""
        # Try the exact title first; a 404 from the direct URL just
        # means we fall back to the search API.
        try:
            exactterm = format_term(term)
            exactterm = quote(exactterm)
            exacturl = self.endpoints['url'].format(exactterm)
            html = web.get(exacturl)
            return (html, exacturl)
        except HTTPError:
            pass

        term = deformat_term(term)
        term = quote(term)
        apiurl = self.endpoints['api'].format(term)

        try:
            result = json.loads(web.get(apiurl))
        except ValueError:
            return None

        result = result['query']['search']

        if not result:
            return None

        # Fetch the page for the top search hit.
        term = result[0]['title']
        term = format_term(term)
        term = quote(term)

        url = self.endpoints['url'].format(term)
        html = web.get(url)
        return (html, url)
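

# A minimal usage sketch. The endpoint templates below are
# illustrative assumptions (any MediaWiki site with the standard
# search API should work), and web.get() is the bot's HTTP helper.
if __name__ == '__main__':
    wiki = Wiki({
        'url': 'https://en.wikipedia.org/wiki/{0}',
        'api': 'https://en.wikipedia.org/w/api.php'
               '?format=json&action=query&list=search&srsearch={0}',
    })
    term, section = parse_term('Python (programming language)#History')
    match = wiki.search(term)
    if match:
        snippet, url = extract_snippet(match, section)
        print(snippet)
        print(url)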