import json
import re
from urllib.parse import quote, unquote

import lxml.html
from requests.exceptions import HTTPError

import web

# NOTE(review): this module reached review with its line breaks collapsed
# and with everything that looked like an HTML tag or entity stripped or
# decoded.  The layout below is restored from the surviving tokens; every
# piece that had to be reconstructed is marked NOTE(review) and must be
# checked against the pristine file before merging.

# One whole table row, opening tag through closing tag.
# NOTE(review): only "]*>.*?" survived tag stripping; the <tr>...</tr>
# wrapper is reconstructed from the variable name -- confirm.
r_tr = re.compile(r'(?ims)<tr[^>]*>.*?</tr>')

# One paragraph or one list item.
# NOTE(review): both alternatives were reduced to "]*>.*?".  <p> is inferred
# from the variable name; the second tag (<li>, with (?!n) so that <link>
# does not match) is an assumption -- confirm.
r_paragraph = re.compile(r'(?ims)<p[^>]*>.*?</p>|<li(?!n)[^>]*>.*?</li>')

# Any tag except those starting with "<!" (comments, doctype).
r_tag = re.compile(r'<(?!!)[^>]+>')

# A run of tabs, carriage returns, newlines or spaces.
r_whitespace = re.compile(r'[\t\r\n ]+')

# NOTE(review): this pattern is truncated -- everything after "\s*" was
# eaten by tag stripping (presumably the link that follows the redirect
# marker).  Restore the tail from the pristine file.
r_redirect = re.compile(
    r'(?ims)class=.redirectText.>\s*'
)

# NOTE(review): the text between the r_redirect pattern and the end of
# Wiki.unescape was lost.  At minimum that region defined format_term() and
# deformat_term() (both called by Wiki.search), the class statement itself,
# and whatever sets self.endpoints (read by Wiki.search).  None of that is
# recreated here; restore it from the pristine file.


class Wiki:
    """Fetch wiki pages and reduce their HTML to plain text."""

    @staticmethod
    def unescape(s):
        """Return *s* with a handful of HTML entities replaced by characters.

        NOTE(review): the entity names were decoded in the damaged copy
        (each call read like ``s.replace('<', '<')``).  The ``&gt;`` line and
        the numeric ``&#160;`` spelling are reconstructions -- confirm.
        """
        s = s.replace('&gt;', '>')
        s = s.replace('&lt;', '<')
        # '&amp;' is handled after '&lt;'/'&gt;' so that "&amp;lt;" ends up
        # as the literal text "&lt;" rather than "<".
        s = s.replace('&amp;', '&')
        s = s.replace('&#160;', ' ')
        s = s.replace('&quot;', '"')
        return s

    @staticmethod
    def text(html):
        """Strip tags from *html*, collapse whitespace and unescape entities.

        Returns the resulting string with leading and trailing whitespace
        removed.
        """
        html = r_tag.sub('', html)
        html = r_whitespace.sub(' ', html)
        return Wiki.unescape(html).strip()

    def search(self, term):
        """Fetch the page for *term*, falling back to the site's search API.

        ``self.endpoints`` must map ``'url'`` and ``'api'`` to format strings
        with one positional placeholder for the URL-quoted term.

        Returns a ``(html, url)`` tuple, or ``None`` when the API response is
        not valid JSON or lists no results.  An ``HTTPError`` raised by the
        API request or by the final page fetch is not caught here.
        """
        # First attempt: treat the term as an exact page title.
        exact_url = self.endpoints['url'].format(quote(format_term(term)))
        try:
            html = web.get(exact_url)
        except HTTPError:
            pass  # no such page; fall through to the search API
        else:
            return (html, exact_url)

        # Second attempt: ask the search API and take its top hit.
        api_url = self.endpoints['api'].format(quote(deformat_term(term)))
        try:
            result = json.loads(web.get(api_url))
        except ValueError:
            return None

        hits = result['query']['search']
        if not hits:
            return None

        title = quote(format_term(hits[0]['title']))
        url = self.endpoints['url'].format(title)
        html = web.get(url)
        return (html, url)