# phenny-1/modules/head.py
#!/usr/bin/env python
"""
head.py - Phenny HTTP Metadata Utilities
Copyright 2008, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.
http://inamidst.com/phenny/
"""
# blame: 2011-12-28 17:45:11 -05:00
import re
# blame: 2013-06-09 01:27:24 -04:00
#import urllib.request
# blame: 2011-12-28 17:45:11 -05:00
import urllib.parse
# blame: 2013-06-09 01:27:24 -04:00
#import http.client
#import http.cookiejar
# blame: 2011-12-28 17:45:11 -05:00
import time
from html.entities import name2codepoint
import web
from tools import deprecated
from modules.linx import get_title as linx_gettitle
# blame: 2013-06-09 01:27:24 -04:00
#cj = http.cookiejar.LWPCookieJar()
#opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
#urllib.request.install_opener(opener)
def head(phenny, input):
    """Provide HTTP HEAD information.

    Usage: ``.head <uri> [header]``.  With no URI, falls back to the
    last URI seen in the channel (recorded by noteuri).  When a header
    name is given, reports only that header; otherwise reports a
    summary: status, content type, last-modified, content length, and
    response time.
    """
    uri = input.group(2) or ''
    if ' ' in uri:
        # The trailing word names a specific header to query.
        uri, header = uri.rsplit(' ', 1)
    else:
        header = None

    if not uri and hasattr(phenny, 'last_seen_uri'):
        try:
            uri = phenny.last_seen_uri[input.sender]
        except KeyError:
            return phenny.say('?')

    if not uri.startswith('htt'):
        uri = 'http://' + uri
    # uri = uri.replace('#!', '?_escaped_fragment_=')

    start = time.time()
    try:
        info = web.head(uri)
        info['status'] = '200'
    except web.HTTPError as e:
        return phenny.say(str(e.code))
    except web.ConnectionError:
        return phenny.say("Can't connect to %s" % uri)
    resptime = time.time() - start

    if header is None:
        data = []
        # FIX: the status is stored under the lowercase key 'status'
        # above, but was previously looked up as 'Status' -- a dead
        # branch for a plain dict.  Lowercase matches the other header
        # lookups below; case-insensitive containers are unaffected.
        if 'status' in info:
            data.append(info['status'])
        if 'content-type' in info:
            data.append(info['content-type'].replace('; charset=', ', '))
        if 'last-modified' in info:
            modified = info['last-modified']
            modified = time.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')
            data.append(time.strftime('%Y-%m-%d %H:%M:%S UTC', modified))
        if 'content-length' in info:
            data.append(info['content-length'] + ' bytes')
        data.append('{0:1.2f} s'.format(resptime))
        phenny.reply(', '.join(data))
    else:
        headerlower = header.lower()
        if headerlower in info:
            phenny.say(header + ': ' + info.get(headerlower))
        else:
            msg = 'There was no %s header in the response.' % header
            phenny.say(msg)
head.commands = ['head']
head.example = '.head http://www.w3.org/'
# Matches the first <title> element; case-insensitive, dot matches
# newlines so multi-line titles are captured.
r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
# Matches HTML character references such as &amp;, &#169;, or &#x2014;.
r_entity = re.compile(r'&[A-Za-z0-9#]+;')
def noteuri(phenny, input):
    """Record the most recently mentioned URI per channel (for .head)."""
    bot = phenny.bot
    if not hasattr(bot, 'last_seen_uri'):
        bot.last_seen_uri = {}
    bot.last_seen_uri[input.sender] = input.group(1)
noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
noteuri.priority = 'low'
def snarfuri(phenny, input):
    """Announce the title of any URI posted in the channel."""
    uri = input.group(1)
    # When a linx API key is configured, delegate the lookup to linx;
    # otherwise fetch the title ourselves.
    use_linx = phenny.config.linx_api_key != ""
    title = (linx_gettitle(phenny, uri, input.sender) if use_linx
             else gettitle(phenny, uri))
    if title:
        phenny.msg(input.sender, title)
snarfuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
snarfuri.priority = 'low'
snarfuri.thread = True
def gettitle(phenny, uri):
    """Fetch *uri* and return its HTML title formatted as '[ title ]'.

    Returns None when no title can be determined: non-HTML content,
    more than 25 redirects, connection failure, or no <title> element.
    Refuses to fetch localhost/loopback addresses.
    """
    if ':' not in uri:
        uri = 'http://' + uri
    # Crawlable-AJAX convention: fetch the escaped-fragment variant.
    uri = uri.replace('#!', '?_escaped_fragment_=')
    title = None
    localhost = [
        'http://localhost/', 'http://localhost:80/',
        'http://localhost:8080/', 'http://127.0.0.1/',
        'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
        'https://localhost/', 'https://localhost:80/',
        'https://localhost:8080/', 'https://127.0.0.1/',
        'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
    ]
    for s in localhost:
        if uri.startswith(s):
            return phenny.reply('Sorry, access forbidden.')
    try:
        # Follow redirects by hand so they can be capped at 25.
        redirects = 0
        while True:
            info = web.head(uri)
            if not isinstance(info, list):
                status = '200'
            else:
                status = str(info[1])
                info = info[0]
            if status.startswith('3'):
                uri = urllib.parse.urljoin(uri, info['Location'])
            else:
                break
            redirects += 1
            if redirects >= 25:
                return None
        try:
            mtype = info['content-type']
        except KeyError:  # was a bare except; only the lookup can fail here
            return None
        # NOTE(review): case-insensitive header containers may return
        # None for a missing key instead of raising -- treat as absent.
        if not mtype:
            return None
        if not (('/html' in mtype) or ('/xhtml' in mtype)):
            return None
        # Renamed from 'bytes' to avoid shadowing the builtin.
        content = web.get(uri)
    except web.ConnectionError:
        return
    m = r_title.search(content)
    if m:
        title = m.group(1).strip()
        title = title.replace('\t', ' ')
        title = title.replace('\r', ' ')
        title = title.replace('\n', ' ')
        # Collapse runs of spaces.  (The single-space form of this loop
        # would never terminate for any title containing a space.)
        while '  ' in title:
            title = title.replace('  ', ' ')
        if len(title) > 200:
            title = title[:200] + '[...]'

        def e(m):
            """Decode one HTML character reference to its character."""
            entity = m.group(0)
            try:
                if entity[:3].lower() == '&#x':
                    return chr(int(entity[3:-1], 16))
                elif entity.startswith('&#'):
                    return chr(int(entity[2:-1]))
            except ValueError:
                # Malformed numeric reference, e.g. '&#;': leave as-is.
                return entity
            cp = name2codepoint.get(entity[1:-1])
            # Unknown named entities are left untouched instead of
            # raising KeyError and aborting the whole title lookup.
            return chr(cp) if cp is not None else entity

        title = r_entity.sub(e, title)
    if title:
        title = title.replace('\n', '')
        title = title.replace('\r', '')
        title = "[ {0} ]".format(title)
    else:
        title = None
    return title
if __name__ == '__main__':
    # Invoked directly: just show the module documentation.
    print(__doc__.strip())