fix wiktionary and add tests
parent
b272cfd0f9
commit
da77b275e0
|
@ -0,0 +1,29 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
test_wiktionary.py - tests for the wiktionary module
|
||||||
|
author: mutantmonkey <mutantmonkey@mutantmonkey.in>
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import unittest
|
||||||
|
from mock import MagicMock, Mock
|
||||||
|
from modules import wiktionary
|
||||||
|
|
||||||
|
|
||||||
|
class TestWiktionary(unittest.TestCase):
    """Tests for the wiktionary lookup function and the ``w`` command handler.

    NOTE(review): ``wiktionary.wiktionary`` presumably fetches from the live
    Wiktionary site, so these tests likely need network access -- confirm.
    """

    def setUp(self):
        # A MagicMock stands in for the bot object so that calls made on it
        # (e.g. say()) are recorded and can be inspected afterwards.
        self.phenny = MagicMock()

    def test_wiktionary(self):
        """Looking up 'test' yields a non-empty etymology and definitions."""
        etymology, definitions = wiktionary.wiktionary('test')

        # unittest assertions instead of bare asserts: they are not stripped
        # under ``python -O`` and report the offending values on failure.
        self.assertGreater(len(etymology), 0)
        self.assertGreater(len(definitions), 0)

    def test_w(self):
        """The ``w`` handler says a line of the form 'test — noun: ...'."""
        # Whatever group index the handler asks for, the fake command input
        # answers with the word 'test'.
        command = Mock(group=lambda x: 'test')
        wiktionary.w(self.phenny, command)

        # First positional argument of the most recent phenny.say() call.
        out = self.phenny.say.call_args[0][0]
        m = re.match('^test — noun: .*$', out, flags=re.UNICODE)
        self.assertIsNotNone(m)
|
|
@ -9,52 +9,66 @@ http://inamidst.com/phenny/
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import web
|
import web
|
||||||
|
import json
|
||||||
|
|
||||||
# Printable HTML view of an entry; %s receives the URL-quoted page title.
uri = 'http://en.wiktionary.org/w/index.php?title=%s&printable=yes'
# API query asking for the page's revision content (prop=revisions,
# rvprop=content) as JSON; {0} receives the URL-quoted page title.
wikiapi = 'http://en.wiktionary.org/w/api.php?action=query&titles={0}&prop=revisions&rvprop=content&format=json'

# A whole <ul>...</ul> element (case-insensitive, multi-line, dot matches \n).
r_ul = re.compile(r'(?ims)<ul>.*?</ul>')
# The "# " marker at the start of a line.
r_li = re.compile(r'^# ')
# An [[Image:...]] inclusion; greedy, so it runs to the last "]]" on the line.
r_img = re.compile(r'\[\[Image:.*\]\]')
# A plain link [[target]]; group 1 is the target.
r_link1 = re.compile(r'\[\[([A-Za-z0-9\-_ ]+?)\]\]')
# A piped link [[target|label]]; group 2 is the label.
r_link2 = re.compile(r'\[\[([A-Za-z0-9\-_ ]+?)\|(.+?)\]\]')
# A {{context|...}} template; group 1 is everything after the first pipe.
r_context = re.compile(r'{{context\|(.+?)}}')
# Any template with a pipe, {{name|args}}; group 1 is the part after the pipe.
r_template1 = re.compile(r'{{.+?\|(.+?)}}')
# Any remaining template {{name}}; group 1 is the template body.
r_template2 = re.compile(r'{{(.+?)}}')
|
||||||
|
|
||||||
def text(html):
    """Reduce one line of wiki markup to plain display text.

    The leading "# " marker is removed and the line is stripped of
    surrounding whitespace.  Image inclusions are then deleted, links are
    replaced by their target (plain link) or label (piped link), and
    templates are replaced by their captured content followed by a colon.

    html -- the markup line to clean (the parameter name is historical)

    Returns the cleaned string.
    """
    cleaned = r_li.sub('', html).strip()

    # Order matters: the specific {{context|...}} pattern must run before the
    # generic template patterns, and piped templates before bare ones.
    substitutions = (
        (r_img, ''),
        (r_link1, r'\1'),
        (r_link2, r'\2'),
        (r_context, r'\1:'),
        (r_template1, r'\1:'),
        (r_template2, r'\1:'),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
|
||||||
|
|
||||||
def wiktionary(word):
    """Fetch and parse the Wiktionary entry for *word*.

    The page's revision content is requested through ``wikiapi`` and scanned
    line by line.  Parsing stops after the first line containing
    '====Synonyms===='.

    Returns a 2-tuple ``(etymology, definitions)``:

    etymology   -- cleaned text of the most recent line read while in the
                   etymology section, or None if there was none
    definitions -- dict mapping a lowercase part-of-speech name ('noun',
                   'verb', ...) to a list of cleaned definition lines

    If the response carries no revision content, ``(None, {})`` is returned
    instead of raising KeyError.
    NOTE(review): the API presumably omits 'revisions' for a page that does
    not exist -- confirm against the MediaWiki API documentation.
    """
    raw = web.get(wikiapi.format(web.quote(word)))
    pages = json.loads(raw)['query']['pages']

    # Only the first page in the response is used.
    page = next(iter(pages.values()), {})
    revisions = page.get('revisions')
    if not revisions:
        return None, {}
    wikitext = revisions[0]['*']

    # Checked in this order; the first name found anywhere in a line wins.
    parts_of_speech = ('Noun', 'Verb', 'Adjective', 'Adverb',
                       'Interjection', 'Particle', 'Preposition')

    mode = None
    etymology = None
    definitions = {}
    for line in wikitext.splitlines():
        section = next((p for p in parts_of_speech if p in line), None)

        if line == '===Etymology===':
            mode = 'etymology'
        elif section is not None:
            # NOTE(review): substring match, so any line that merely mentions
            # e.g. 'Verb' also switches mode -- kept as in the original.
            mode = section.lower()
        elif not line:
            # A blank line ends the current section.
            mode = None
        elif mode == 'etymology':
            etymology = text(line)
        elif mode is not None and '#' in line:
            definitions.setdefault(mode, []).append(text(line))

        if '====Synonyms====' in line:
            break
    return etymology, definitions
|
||||||
|
|
||||||
|
@ -92,9 +106,5 @@ def w(phenny, input):
|
||||||
w.commands = ['w']
|
w.commands = ['w']
|
||||||
w.example = '.w bailiwick'
|
w.example = '.w bailiwick'
|
||||||
|
|
||||||
def encarta(phenny, input):
|
|
||||||
return phenny.reply('Microsoft removed Encarta, try .w instead!')
|
|
||||||
encarta.commands = ['dict']
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
print(__doc__.strip())
|
print(__doc__.strip())
|
||||||
|
|
Loading…
Reference in New Issue