fix wiktionary and add tests

master
mutantmonkey 2012-06-01 21:01:56 -07:00
parent b272cfd0f9
commit da77b275e0
2 changed files with 64 additions and 25 deletions

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
"""
test_wiktionary.py - tests for the wiktionary module
author: mutantmonkey <mutantmonkey@mutantmonkey.in>
"""
import re
import unittest
from mock import MagicMock, Mock
from modules import wiktionary
class TestWiktionary(unittest.TestCase):
    """Tests for the wiktionary module.

    Both tests call into wiktionary.wiktionary() with the word 'test';
    presumably that performs a live request to en.wiktionary.org, so
    they need network access -- TODO confirm against the web module.
    """

    def setUp(self):
        # Stand-in for the bot object; MagicMock records every call made
        # on it (e.g. say()) so the tests can inspect the output.
        self.phenny = MagicMock()

    def test_wiktionary(self):
        """wiktionary() returns a non-empty etymology and definitions."""
        etymology, definitions = wiktionary.wiktionary('test')

        # unittest assertions instead of bare ``assert``: they are not
        # stripped under ``python -O`` and report a message on failure.
        self.assertTrue(etymology, 'no etymology returned')
        self.assertTrue(definitions, 'no definitions returned')

    def test_w(self):
        """The .w command says a line starting with 'test — noun: '."""
        # Only group() is used here; every group index yields 'test'.
        # Named ``event`` so the ``input`` builtin is not shadowed.
        event = Mock(group=lambda x: 'test')
        wiktionary.w(self.phenny, event)

        # Fail with a clear assertion rather than a TypeError on
        # ``call_args`` when nothing was said.
        self.assertTrue(self.phenny.say.called, 'phenny.say was not called')
        out = self.phenny.say.call_args[0][0]
        m = re.match('^test — noun: .*$', out, flags=re.UNICODE)
        self.assertTrue(m, 'unexpected output: %r' % (out,))

View File

@ -9,52 +9,66 @@ http://inamidst.com/phenny/
import re import re
import web import web
import json
# Printable HTML view of an entry; '%s' is the title.
uri = 'http://en.wiktionary.org/w/index.php?title=%s&printable=yes'
# MediaWiki API query for the content of the latest revision of a page,
# returned as JSON; '{0}' is the title.
wikiapi = 'http://en.wiktionary.org/w/api.php?action=query&titles={0}&prop=revisions&rvprop=content&format=json'

# HTML patterns: any tag, and whole <ul> lists.  text() no longer uses
# r_tag; it stays defined so existing references keep working.
r_tag = re.compile(r'<[^>]+>')
r_ul = re.compile(r'(?ims)<ul>.*?</ul>')

# Wikitext patterns used by text().
# Leading "# " marker at the start of a line.
r_li = re.compile(r'^# ')
# [[Image:...]] embeds.  The body may contain one level of nested
# [[links]] (captions); a plain greedy ".*" would also swallow everything
# up to the last "]]" on the line, including unrelated text and links.
r_img = re.compile(r'\[\[Image:(?:[^\[\]]|\[\[[^\[\]]*\]\])*\]\]')
# [[target]] and [[target|label]] links.
r_link1 = re.compile(r'\[\[([A-Za-z0-9\-_ ]+?)\]\]')
r_link2 = re.compile(r'\[\[([A-Za-z0-9\-_ ]+?)\|(.+?)\]\]')
# {{context|...}} labels, then generic {{name|args}} and {{name}} templates.
r_context = re.compile(r'{{context\|(.+?)}}')
r_template1 = re.compile(r'{{.+?\|(.+?)}}')
r_template2 = re.compile(r'{{(.+?)}}')
def text(html):
    """Reduce one line of wikitext to plain text.

    The leading "# " marker is removed and the result stripped of
    surrounding whitespace; then, in order: [[Image:...]] embeds are
    dropped, [[target]] links become their target, [[target|label]]
    links become their label, and {{context|...}} and other templates
    become their captured text followed by a colon.

    The parameter is still called ``html`` although the patterns applied
    here match wikitext markup.
    """
    cleaned = r_li.sub('', html).strip()

    # (pattern, replacement) pairs; order matters because the generic
    # template patterns would also match {{context|...}}.
    substitutions = (
        (r_img, ''),
        (r_link1, r'\1'),
        (r_link2, r'\2'),
        (r_context, r'\1:'),
        (r_template1, r'\1:'),
        (r_template2, r'\1:'),
    )
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned
# A wikitext section heading: the same run of "=" on both sides of the
# title, e.g. "===Noun===".
_r_heading = re.compile(r'^(=+)\s*(.+?)\s*\1\s*$')

# Heading titles whose sections are collected.  The parser mode for a
# section is its lower-cased title.
_SECTIONS = frozenset((
    'Etymology', 'Noun', 'Verb', 'Adjective', 'Adverb', 'Interjection',
    'Particle', 'Preposition',
))


def _is_definition(line):
    """Return True for "#" list items that are not "#:" / "#*" sub-items."""
    if not line.startswith('#'):
        return False
    return not line.lstrip('#').startswith((':', '*'))


def wiktionary(word):
    """Fetch the wikitext of *word* and extract etymology and definitions.

    Returns a tuple ``(etymology, definitions)``.  ``etymology`` is the
    cleaned text of the last line read in an Etymology section, or None.
    ``definitions`` maps a part-of-speech name ('noun', 'verb', ...) to a
    list of cleaned definition lines.  When the API response contains no
    page content (for example the page does not exist), ``(None, {})`` is
    returned instead of raising KeyError.
    """
    raw = web.get(wikiapi.format(web.quote(word)))
    pages = json.loads(raw).get('query', {}).get('pages', {})

    # The query asks for a single title, so only the first page matters.
    page = next(iter(pages.values()), {})
    revisions = page.get('revisions')
    if not revisions:
        return None, {}
    result = revisions[0]['*']

    mode = None
    etymology = None
    definitions = {}
    for line in result.splitlines():
        heading = _r_heading.match(line)
        if heading:
            # Only real headings switch sections; a definition that merely
            # mentions "Noun" or "Verb" must not.  Unknown headings end
            # the current section.
            title = heading.group(2)
            mode = title.lower() if title in _SECTIONS else None
        elif not line:
            # NOTE(review): a blank line ends the current section, so
            # definitions separated from their heading by a blank line
            # are skipped -- confirm this is intended.
            mode = None
        elif mode == 'etymology':
            etymology = text(line)
        elif mode is not None and _is_definition(line):
            definitions.setdefault(mode, []).append(text(line))

        # Stop at the first Synonyms subsection.
        if '====Synonyms====' in line:
            break
    return etymology, definitions
@ -92,9 +106,5 @@ def w(phenny, input):
w.commands = ['w'] w.commands = ['w']
w.example = '.w bailiwick' w.example = '.w bailiwick'
def encarta(phenny, input):
    """Reply that Encarta is gone and point the user at the .w command."""
    notice = 'Microsoft removed Encarta, try .w instead!'
    return phenny.reply(notice)
encarta.commands = ['dict']
if __name__ == '__main__':
    # __doc__ is None when docstrings are stripped (python -OO); fall back
    # to an empty string instead of raising AttributeError.
    print((__doc__ or '').strip())