phenny/modules/head.py

#!/usr/bin/env python
"""
head.py - Phenny HTTP Metadata Utilities
Copyright 2008, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

http://inamidst.com/phenny/
"""

import re, urllib, urlparse, time
from htmlentitydefs import name2codepoint
import web
from tools import deprecated

# .head command: issue an HTTP HEAD request and report on the response.
#
# The command argument (input.group(2)) is either "<URI>" or
# "<URI> <header>". With a URI only, the reply is a comma-separated summary of
# the status, Content-Type, Last-Modified and Content-Length. With a trailing
# header name, only that response header is reported (looked up in lower case).
# When no URI is given, the one stored in phenny.last_seen_uri for
# input.sender is used; if there is none, the bot says '?'.
#
# NOTE(review): the docstring is kept to one line in case the bot surfaces it
# as help text - confirm before expanding it.
def head(phenny, input):
   """Provide HTTP HEAD information."""
   uri = input.group(2)
   uri = (uri or '').encode('utf-8')
   # Everything after the last space is taken to be a header name.
   if ' ' in uri:
      uri, header = uri.rsplit(' ', 1)
   else: uri, header = uri, None

   if not uri:
      # Nothing to request without a URI: use the last one recorded for this
      # sender, or give up rather than sending a HEAD request for ''.
      if not hasattr(phenny, 'last_seen_uri'):
         return phenny.say('?')
      try: uri = phenny.last_seen_uri[input.sender]
      except KeyError: return phenny.say('?')

   try: info = web.head(uri)
   except IOError: return phenny.say("Can't connect to %s" % uri)

   # Two result shapes are handled: a list whose first item is the headers
   # and whose second is the status, or the headers alone (reported as 200).
   if not isinstance(info, list):
      info = dict(info)
      info['Status'] = '200'
   else:
      status = str(info[1])
      info = dict(info[0])
      info['Status'] = status

   if header is None:
      data = [info['Status']]
      if 'content-type' in info:
         data.append(info['content-type'].replace('; charset=', ', '))
      if 'last-modified' in info:
         modified = info['last-modified']
         try:
            parsed = time.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')
         except ValueError:
            # Date not in the expected format: show the raw header value
            # instead of letting the exception abort the command.
            data.append(modified)
         else:
            data.append(time.strftime('%Y-%m-%d %H:%M:%S UTC', parsed))
      if 'content-length' in info:
         data.append(info['content-length'] + ' bytes')
      phenny.reply(', '.join(data))
   else:
      headerlower = header.lower()
      if headerlower in info:
         phenny.say(header + ': ' + info[headerlower])
      else:
         msg = 'There was no %s header in the response.' % header
         phenny.say(msg)
head.commands = ['head']
head.example = '.head http://www.w3.org/'

# Matches a <title>...</title> element. The inline (?ims) flags make the
# pattern case-insensitive and multiline, and let '.' match newlines; group 1
# is the non-greedy title text between the tags.
r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
# Matches a character reference: '&', then letters, digits or '#', then ';'
# (for example &amp;, &#38; or &#x26;).
r_entity = re.compile(r'&[A-Za-z0-9#]+;')

# .title command (old-style signature, wrapped by @deprecated): resolve a URI,
# following redirects with HEAD requests, and message the document's <title>
# to origin.sender.
#
# The URI comes from match.group(2); when it is empty, the one stored in
# self.last_seen_uri for origin.sender is used if available. Only documents
# whose Content-Type contains '/html' or '/xhtml' are fetched, and only the
# first 32768 bytes are read. Failures are reported to origin.sender as
# messages rather than raised.
@deprecated
def f_title(self, origin, match, args):
   """.title <URI> - Return the title of URI."""
   uri = match.group(2)
   uri = (uri or '').encode('utf-8')

   if not uri and hasattr(self, 'last_seen_uri'):
      uri = self.last_seen_uri.get(origin.sender)
   if not uri:
      return self.msg(origin.sender, 'I need a URI to give the title of...')

   # Input without any ':' has no scheme; assume HTTP.
   if ':' not in uri:
      uri = 'http://' + uri

   try:
      # Follow redirects by hand, giving up after 25 hops.
      redirects = 0
      while True:
         info = web.head(uri)

         if not isinstance(info, list):
            status = '200'
         else:
            status = str(info[1])
            info = info[0]
         if not status.startswith('3'):
            break

         # A 3xx response without a Location header cannot be followed;
         # treat it as the final response instead of raising KeyError.
         try: location = info['Location']
         except KeyError: break
         uri = urlparse.urljoin(uri, location)

         redirects += 1
         if redirects >= 25:
            self.msg(origin.sender, origin.nick + ": Too many redirects")
            return

      try: mtype = info['content-type']
      except (KeyError, TypeError):
         err = ": Couldn't get the Content-Type, sorry"
         return self.msg(origin.sender, origin.nick + err)
      if not (('/html' in mtype) or ('/xhtml' in mtype)):
         self.msg(origin.sender, origin.nick + ": Document isn't HTML")
         return

      u = urllib.urlopen(uri)
      # Close the connection even if the read fails.
      try: data = u.read(32768)
      finally: u.close()

   except IOError:
      self.msg(origin.sender, "Can't connect to %s" % uri)
      return

   m = r_title.search(data)
   if m:
      title = m.group(1)
      title = title.strip()
      title = title.replace('\t', ' ')
      title = title.replace('\r', ' ')
      title = title.replace('\n', ' ')
      # Collapse runs of spaces left by the replacements above.
      while '  ' in title:
         title = title.replace('  ', ' ')
      if len(title) > 200:
         title = title[:200] + '[...]'

      def decode_entity(m):
         # Turn one character reference into UTF-8 text. References that
         # cannot be decoded (unknown name, malformed or out-of-range number)
         # are left in the title unchanged instead of raising.
         entity = m.group(0)
         try:
            if entity[:3].lower() == '&#x':
               cp = int(entity[3:-1], 16)
            elif entity.startswith('&#'):
               cp = int(entity[2:-1])
            else:
               cp = name2codepoint[entity[1:-1]]
            return unichr(cp).encode('utf-8')
         except (KeyError, ValueError, OverflowError):
            return entity
      title = r_entity.sub(decode_entity, title)

      if not title:
         title = '[Title is the empty document, "".]'
      self.msg(origin.sender, origin.nick + ': ' + title)
   else: self.msg(origin.sender, origin.nick + ': No title found')
f_title.commands = ['title']

# Record the URI captured by noteuri.rule as the last one seen for
# input.sender, in phenny.bot.last_seen_uri (created on first use).
#
# The rule captures the last http:// or https:// URI in the text. The URI is
# matched lazily up to a delimiter (one of < > space " \x01) or the end of the
# text, so that a single trailing ',' or '.' is left out of the capture.
# \Z is used for "end of text" rather than '$'.
def noteuri(phenny, input):
   uri = input.group(1).encode('utf-8')
   if not hasattr(phenny.bot, 'last_seen_uri'):
      phenny.bot.last_seen_uri = {}
   phenny.bot.last_seen_uri[input.sender] = uri
noteuri.rule = r'.*(https?://[^<> "\x01]+?)[,.]?(?=[<> "\x01]|\Z)'
noteuri.priority = 'low'

if __name__ == '__main__':
   # Run directly: just print the module docstring. The parenthesized form
   # behaves the same as the print statement for a single value and is also
   # valid Python 3; "or ''" covers a missing docstring (__doc__ is None).
   print((__doc__ or '').strip())
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`#!/usr/bin/env python`
			`"""`
			`head.py - Phenny HTTP Metadata Utilities`
			`Copyright 2008, Sean B. Palmer, inamidst.com`
			`Licensed under the Eiffel Forum License 2.`

			`http://inamidst.com/phenny/`
			`"""`

Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`import re, urllib, urlparse, time`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`from htmlentitydefs import name2codepoint`
			`import web`
			`from tools import deprecated`

Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`def head(phenny, input):`
			`"""Provide HTTP HEAD information."""`
			`uri = input.group(2)`
			`uri = (uri or '').encode('utf-8')`
			`if ' ' in uri:`
			`uri, header = uri.rsplit(' ', 1)`
			`else: uri, header = uri, None`

			`if not uri and hasattr(phenny, 'last_seen_uri'):`
New "limit" config variable, and some module fixes. 2008-03-31 11:17:32 -04:00			`try: uri = phenny.last_seen_uri[input.sender]`
			`except KeyError: return phenny.say('?')`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`try: info = web.head(uri)`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`except IOError: return phenny.say("Can't connect to %s" % uri)`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`if not isinstance(info, list):`
			`info = dict(info)`
			`info['Status'] = '200'`
			`else:`
			`newInfo = dict(info[0])`
			`newInfo['Status'] = str(info[1])`
			`info = newInfo`

			`if header is None:`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`data = []`
			`if info.has_key('Status'):`
			`data.append(info['Status'])`
			`if info.has_key('content-type'):`
			`data.append(info['content-type'].replace('; charset=', ', '))`
			`if info.has_key('last-modified'):`
			`modified = info['last-modified']`
			`modified = time.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')`
			`data.append(time.strftime('%Y-%m-%d %H:%M:%S UTC', modified))`
			`if info.has_key('content-length'):`
			`data.append(info['content-length'] + ' bytes')`
			`phenny.reply(', '.join(data))`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`else:`
			`headerlower = header.lower()`
			`if info.has_key(headerlower):`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`phenny.say(header + ': ' + info.get(headerlower))`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`else:`
			`msg = 'There was no %s header in the response.' % header`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`phenny.say(msg)`
			`head.commands = ['head']`
			`head.example = '.head http://www.w3.org/'`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')`
			`r_entity = re.compile(r'&[A-Za-z0-9#]+;')`

			`@deprecated`
			`def f_title(self, origin, match, args):`
			`""".title <URI> - Return the title of URI."""`
			`uri = match.group(2)`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`uri = (uri or '').encode('utf-8')`

			`if not uri and hasattr(self, 'last_seen_uri'):`
New "limit" config variable, and some module fixes. 2008-03-31 11:17:32 -04:00			`uri = self.last_seen_uri.get(origin.sender)`
Some documentation and minor fixes. 2008-03-10 15:58:28 -04:00			`if not uri:`
New "limit" config variable, and some module fixes. 2008-03-31 11:17:32 -04:00			`return self.msg(origin.sender, 'I need a URI to give the title of...')`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`if not ':' in uri:`
			`uri = 'http://' + uri`

			`try:`
			`redirects = 0`
			`while True:`
			`info = web.head(uri)`

			`if not isinstance(info, list):`
			`status = '200'`
			`else:`
			`status = str(info[1])`
			`info = info[0]`
			`if status.startswith('3'):`
Lots of fixes, changes, and new goodies. 2008-02-23 07:16:43 -05:00			`uri = urlparse.urljoin(uri, info['Location'])`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`else: break`

			`redirects += 1`
			`if redirects >= 25:`
			`self.msg(origin.sender, origin.nick + ": Too many redirects")`
			`return`

Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`try: mtype = info['content-type']`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`except:`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`err = ": Couldn't get the Content-Type, sorry"`
			`return self.msg(origin.sender, origin.nick + err)`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00			`if not (('/html' in mtype) or ('/xhtml' in mtype)):`
			`self.msg(origin.sender, origin.nick + ": Document isn't HTML")`
			`return`

			`u = urllib.urlopen(uri)`
			`bytes = u.read(32768)`
			`u.close()`

			`except IOError:`
			`self.msg(origin.sender, "Can't connect to %s" % uri)`
			`return`

			`m = r_title.search(bytes)`
			`if m:`
			`title = m.group(1)`
			`title = title.strip()`
			`title = title.replace('\t', ' ')`
			`title = title.replace('\r', ' ')`
			`title = title.replace('\n', ' ')`
			`while '  ' in title:`
			`title = title.replace('  ', ' ')`
			`if len(title) > 200:`
			`title = title[:200] + '[...]'`

			`def e(m):`
			`entity = m.group(0)`
			`if entity.startswith('&#x'):`
			`cp = int(entity[3:-1], 16)`
			`return unichr(cp).encode('utf-8')`
			`elif entity.startswith('&#'):`
			`cp = int(entity[2:-1])`
			`return unichr(cp).encode('utf-8')`
			`else:`
			`char = name2codepoint[entity[1:-1]]`
			`return unichr(char).encode('utf-8')`
			`title = r_entity.sub(e, title)`

			`if not title:`
			`title = '[Title is the empty document, "".]'`
			`self.msg(origin.sender, origin.nick + ': ' + title)`
			`else: self.msg(origin.sender, origin.nick + ': No title found')`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`f_title.commands = ['title']`

			`def noteuri(phenny, input):`
			`uri = input.group(1).encode('utf-8')`
Some documentation and minor fixes. 2008-03-10 15:58:28 -04:00			`if not hasattr(phenny.bot, 'last_seen_uri'):`
			`phenny.bot.last_seen_uri = {}`
			`phenny.bot.last_seen_uri[input.sender] = uri`
New "limit" config variable, and some module fixes. 2008-03-31 11:17:32 -04:00			`noteuri.rule = r'.*(http://[^<> "\x01]+)[,.]?'`
Some more little fixes, and added a Makefile. 2008-02-29 10:36:18 -05:00			`noteuri.priority = 'low'`
Phenny2, now being tested on Freenode as the main phenny. 2008-02-21 07:06:33 -05:00
			`if __name__ == '__main__':`
Better __doc__ consistency; really a test of some mercurial issues. 2008-03-02 09:44:14 -05:00			`print __doc__.strip()`