added uri snarfing with automatic title reading

master
David Moore 2011-03-05 19:51:52 -06:00
parent 18a24a8117
commit ff2434db41
1 changed files with 34 additions and 24 deletions

View File

@ -9,6 +9,7 @@ http://inamidst.com/phenny/
import re, urllib, urllib2, httplib, urlparse, time, cookielib import re, urllib, urllib2, httplib, urlparse, time, cookielib
from htmlentitydefs import name2codepoint from htmlentitydefs import name2codepoint
from string import join
import web import web
from tools import deprecated from tools import deprecated
@ -82,7 +83,32 @@ def f_title(self, origin, match, args):
uri = self.last_seen_uri.get(origin.sender) uri = self.last_seen_uri.get(origin.sender)
if not uri: if not uri:
return self.msg(origin.sender, 'I need a URI to give the title of...') return self.msg(origin.sender, 'I need a URI to give the title of...')
title = gettitle(uri)
if title:
self.msg(origin.sender, origin.nick + ': ' + title)
else: self.msg(origin.sender, origin.nick + ': No title found')
f_title.commands = ['title']
def noteuri(phenny, input):
    """Remember the most recent URI seen from each sender.

    The URI captured by group 1 of ``noteuri.rule`` is stored, UTF-8
    encoded, in ``phenny.bot.last_seen_uri`` keyed by ``input.sender``.
    The dictionary is created on ``phenny.bot`` the first time it is needed.

    The rule's character class is greedy and allows '.' and ',', so the
    optional ``[,.]?`` that follows the group never consumes anything and
    sentence punctuation ("see http://example.com/.") ends up inside the
    capture. Trailing commas/periods are therefore stripped here.
    """
    # Strip before encoding so the argument type matches the matched text.
    uri = input.group(1).rstrip(',.').encode('utf-8')
    if not hasattr(phenny.bot, 'last_seen_uri'):
        phenny.bot.last_seen_uri = {}
    phenny.bot.last_seen_uri[input.sender] = uri
noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
noteuri.priority = 'low'
# Non-capturing alternation of every command name registered for f_title,
# used by snarfuri to recognise an explicit title request.
titlecommands = r'(?:' + r'|'.join(f_title.commands) + r')'

def snarfuri(phenny, input):
    """Announce the title of any URI that appears in a message.

    Messages that start with the configured command prefix followed by one
    of the f_title command names are ignored here (presumably so the
    explicit command is the only thing that answers them).

    Otherwise the URI captured by group 1 of ``snarfuri.rule`` is passed to
    ``gettitle``; when that yields a truthy title it is sent back to
    ``input.sender`` wrapped in ``[ ... ]``. A falsy result sends nothing.

    The rule's greedy character class swallows a trailing '.' or ',', so
    those are stripped from the capture before the lookup.
    """
    if re.match(r'(?i)' + phenny.config.prefix + titlecommands, input.group()):
        return
    # Strip before encoding so the argument type matches the matched text.
    uri = input.group(1).rstrip(',.').encode('utf-8')
    title = gettitle(uri)
    if title:
        phenny.msg(input.sender, '[ ' + title + ' ]')
snarfuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
snarfuri.priority = 'low'
def gettitle(uri):
if not ':' in uri: if not ':' in uri:
uri = 'http://' + uri uri = 'http://' + uri
uri = uri.replace('#!', '?_escaped_fragment_=') uri = uri.replace('#!', '?_escaped_fragment_=')
@ -98,7 +124,6 @@ def f_title(self, origin, match, args):
u = urllib2.urlopen(req) u = urllib2.urlopen(req)
info = u.info() info = u.info()
u.close() u.close()
# info = web.head(uri)
if not isinstance(info, list): if not isinstance(info, list):
status = '200' status = '200'
@ -111,23 +136,19 @@ def f_title(self, origin, match, args):
redirects += 1 redirects += 1
if redirects >= 25: if redirects >= 25:
self.msg(origin.sender, origin.nick + ": Too many redirects") return None
return
try: mtype = info['content-type'] try: mtype = info['content-type']
except: except:
err = ": Couldn't get the Content-Type, sorry" return None
return self.msg(origin.sender, origin.nick + err) if not (('/html' in mtype) or ('/xhtml' in mtype)):
if not (('/html' in mtype) or ('/xhtml' in mtype)): return None
self.msg(origin.sender, origin.nick + ": Document isn't HTML")
return
u = urllib2.urlopen(req) u = urllib2.urlopen(req)
bytes = u.read(262144) bytes = u.read(262144)
u.close() u.close()
except IOError: except IOError:
self.msg(origin.sender, "Can't connect to %s" % uri)
return return
m = r_title.search(bytes) m = r_title.search(bytes)
@ -161,21 +182,10 @@ def f_title(self, origin, match, args):
try: title = title.decode('iso-8859-1').encode('utf-8') try: title = title.decode('iso-8859-1').encode('utf-8')
except: title = title.decode('cp1252').encode('utf-8') except: title = title.decode('cp1252').encode('utf-8')
else: pass else: pass
else: title = '[The title is empty.]' title = title.replace('\n', '')
title = title.replace('\r', '')
title = title.replace('\n', '') else: title = None
title = title.replace('\r', '') return title
self.msg(origin.sender, origin.nick + ': ' + title)
else: self.msg(origin.sender, origin.nick + ': No title found')
f_title.commands = ['title']
# NOTE(review): this appears to be the removed-side (pre-move) copy of
# noteuri in this diff rendering; the code is left exactly as shown.
#
# noteuri stores the URI captured by group 1 of noteuri.rule, UTF-8
# encoded, in phenny.bot.last_seen_uri keyed by input.sender, creating
# that dictionary on phenny.bot if it does not exist yet.
#
# The character class in the rule is greedy and allows '.' and ',', so the
# trailing [,.]? never consumes anything: punctuation directly after a URI
# stays inside the captured group.
def noteuri(phenny, input):
uri = input.group(1).encode('utf-8')
if not hasattr(phenny.bot, 'last_seen_uri'):
phenny.bot.last_seen_uri = {}
phenny.bot.last_seen_uri[input.sender] = uri
noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
noteuri.priority = 'low'
if __name__ == '__main__': if __name__ == '__main__':
print __doc__.strip() print __doc__.strip()