import json
import lxml.html
import re
from requests.exceptions import HTTPError
from urllib.parse import quote, unquote
import web
r_tr = re.compile(r'(?ims)
]*>.*?
')
r_paragraph = re.compile(r'(?ims)]*>.*?
|]*>.*?')
# Matches any markup tag ``<...>`` except one that opens with ``<!``
# (the negative lookahead leaves e.g. ``<!-- ... -->`` comments alone).
r_tag = re.compile(r'<(?!!)[^>]+>')
# Matches a run of one or more tabs, carriage returns, newlines or spaces.
r_whitespace = re.compile(r'[\t\r\n ]+')
# Matches ``class=`` + any one character + ``redirectText`` + any one
# character + ``>``, plus any whitespace that follows. The inline ``(?ims)``
# sets the ignore-case, multi-line and dot-all flags.
# NOTE(review): the pattern stops right after ``\s*`` and has no capture
# group -- confirm that nothing is missing from its tail.
r_redirect = re.compile(
r'(?ims)class=.redirectText.>\s*')
s = s.replace('<', '<')
s = s.replace('&', '&')
s = s.replace(' ', ' ')
s = s.replace('"', '"')
return s
@staticmethod
def text(html):
    """Reduce an HTML fragment to plain text.

    Everything matched by ``r_tag`` is removed, each run matched by
    ``r_whitespace`` is collapsed to a single space, the result is passed
    through ``Wiki.unescape`` and finally stripped of leading and
    trailing whitespace.

    Args:
        html: The markup string to convert.

    Returns:
        The stripped plain-text string.
    """
    # Tags are removed before unescaping, so any markup characters that
    # unescape() produces are never fed back through the tag pattern.
    without_tags = r_tag.sub('', html)
    single_spaced = r_whitespace.sub(' ', without_tags)
    plain = Wiki.unescape(single_spaced)
    return plain.strip()
def search(self, term):
    """Fetch the page for ``term``, falling back to an API search.

    The page URL built directly from ``term`` is requested first. If that
    request raises ``HTTPError``, the ``api`` endpoint is queried instead
    and the page for the title of the first hit is fetched.

    Args:
        term: The term to look up; it is passed through ``format_term``
            (direct lookup) or ``deformat_term`` (API query) and then
            URL-quoted before being inserted into the endpoint template.

    Returns:
        An ``(html, url)`` tuple for the page that was fetched, or
        ``None`` when the API reply is not valid JSON, does not contain
        a ``query`` -> ``search`` list, or that list is empty.

    Raises:
        Exceptions raised by ``web.get`` for the API request or for the
        final page request are not caught here and propagate to the
        caller.
    """
    # Optimistic path: assume ``term`` already names an existing page.
    exacturl = self.endpoints['url'].format(quote(format_term(term)))
    try:
        return (web.get(exacturl), exacturl)
    except HTTPError:
        # Deliberate fall-through: the direct lookup failed, so ask the
        # search API for the closest title instead.
        pass

    apiurl = self.endpoints['api'].format(quote(deformat_term(term)))
    try:
        reply = json.loads(web.get(apiurl))
    except ValueError:
        # The API body was not parseable JSON.
        return None

    # A reply without the expected ``query.search`` structure (for
    # example an error payload) is treated like "no results" rather
    # than surfacing as a KeyError/TypeError to the caller.
    try:
        hits = reply['query']['search']
    except (KeyError, TypeError):
        return None
    if not hits:
        return None

    best_title = hits[0]['title']
    url = self.endpoints['url'].format(quote(format_term(best_title)))
    return (web.get(url), url)