phenny/modules/head.py

#!/usr/bin/env python
"""
head.py - Phenny HTTP Metadata Utilities
Copyright 2008, Sean B. Palmer, inamidst.com
Licensed under the Eiffel Forum License 2.

http://inamidst.com/phenny/
"""

import re, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, http.client, urllib.parse, time, http.cookiejar
from html.entities import name2codepoint
from string import join
import web
from tools import deprecated

# Install a urllib opener, at import time, whose requests share one cookie
# jar (`cj`), so cookies set by one response are sent with later requests.
cj = http.cookiejar.LWPCookieJar()
opener = urllib.request.build_opener(
   urllib.request.HTTPCookieProcessor(cookiejar=cj)
)
urllib.request.install_opener(opener)

def head(phenny, input):
   """Provide HTTP HEAD information.

   Usage: ``.head <URI> [header]``. With only a URI, replies with a summary
   (status, content type, last-modified time, content length). With a
   trailing header name, says the value of just that header.

   When no URI is given, the last URI recorded for the sender's channel in
   ``phenny.last_seen_uri`` is used if that attribute exists; if nothing is
   available at all, a usage hint is sent instead of attempting a request.

   Arguments:
      phenny -- bot object; must provide say() and reply().
      input  -- match object; group(2) holds the command argument and
                ``sender`` names the channel or user.
   """
   usage = 'Try .head http://example.org/ [optional header]'

   uri = input.group(2) or ''
   if ' ' in uri:
      # The last space-separated word is taken to be the header name.
      uri, header = uri.rsplit(' ', 1)
   else: header = None

   if not uri and hasattr(phenny, 'last_seen_uri'):
      try: uri = phenny.last_seen_uri[input.sender]
      except KeyError: return phenny.say('?')

   # Nothing to look up: don't fire a request at the bare string 'http://'.
   if not uri:
      return phenny.reply(usage)

   if not uri.startswith('htt'):
      uri = 'http://' + uri
   # uri = uri.replace('#!', '?_escaped_fragment_=')

   try: info = web.head(uri)
   except IOError: return phenny.say("Can't connect to %s" % uri)
   except http.client.InvalidURL: return phenny.say("Not a valid URI, sorry.")

   # web.head() is handled here as returning either a headers mapping or a
   # [headers, status] list; a bare mapping is treated as a 200 response.
   status = '200'
   if isinstance(info, list):
      info, status = info[0], str(info[1])
   try: raw = dict(info)
   except TypeError:
      return phenny.reply(usage)

   # HTTP header names are case-insensitive, and the lookups below use
   # lowercase names, so fold whatever case the mapping supplied.
   headers = dict((str(name).lower(), value) for name, value in raw.items())
   headers['Status'] = status

   if header is None:
      data = [headers['Status']]
      if 'content-type' in headers:
         data.append(headers['content-type'].replace('; charset=', ', '))
      if 'last-modified' in headers:
         modified = headers['last-modified']
         try:
            parsed = time.strptime(modified, '%a, %d %b %Y %H:%M:%S %Z')
         except ValueError:
            # Not in the expected date format: show it verbatim rather than
            # failing the whole command.
            data.append(modified)
         else:
            data.append(time.strftime('%Y-%m-%d %H:%M:%S UTC', parsed))
      if 'content-length' in headers:
         data.append(headers['content-length'] + ' bytes')
      phenny.reply(', '.join(data))
   else:
      headerlower = header.lower()
      if headerlower in headers:
         phenny.say(header + ': ' + headers[headerlower])
      else:
         msg = 'There was no %s header in the response.' % header
         phenny.say(msg)
head.commands = ['head']
head.example = '.head http://www.w3.org/'

# Captures the contents of a <title> element. The inline flags make the match
# case-insensitive and multi-line, and let '.' span newlines.
r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
# Matches one character reference of the form &name; or &#number;.
r_entity = re.compile(r'&[A-Za-z0-9#]+;')

@deprecated
def f_title(self, origin, match, args):
   """.title <URI> - Return the title of URI."""
   target = match.group(2) or ''

   # No explicit URI: use the one last recorded for this channel, if any.
   if not target and hasattr(self, 'last_seen_uri'):
      target = self.last_seen_uri.get(origin.sender)
   if not target:
      return self.msg(origin.sender, 'I need a URI to give the title of...')

   found = gettitle(target)
   if found:
      self.msg(origin.sender, origin.nick + ': ' + found)
   else:
      self.msg(origin.sender, origin.nick + ': No title found')
f_title.commands = ['title']

def noteuri(phenny, input):
   """Record the URI matched in a message as the latest one for its sender.

   The mapping lives on ``phenny.bot.last_seen_uri`` (created on first use)
   and is keyed by ``input.sender``.
   """
   latest = input.group(1)
   bot = phenny.bot
   if not hasattr(bot, 'last_seen_uri'):
      bot.last_seen_uri = {}
   bot.last_seen_uri[input.sender] = latest
noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
noteuri.priority = 'low'

# Alternation of every .title command name, used to recognise messages that
# f_title will already answer. Built with str.join: the string.join()
# function this line used to call does not exist in Python 3.
titlecommands = r'(?:' + r'|'.join(f_title.commands) + r')'
def snarfuri(phenny, input):
   """Announce the title of a URI mentioned in a message.

   Does nothing when the message is itself a title command (config prefix
   followed by one of f_title.commands), or when gettitle() finds no title.
   """
   if re.match(r'(?i)' + phenny.config.prefix + titlecommands, input.group()):
      return
   uri = input.group(1)
   title = gettitle(uri)
   if title:
      phenny.msg(input.sender, '[ ' + title + ' ]')
snarfuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
snarfuri.priority = 'low'

# Hosts that must never be fetched on behalf of a chat user.
_LOCAL_HOSTS = frozenset(['localhost', '127.0.0.1', '::1'])

def _is_local_uri(uri):
   """Return True when uri names a loopback host, whatever the port or path."""
   try: host = urllib.parse.urlsplit(uri).hostname
   except ValueError: return False
   return host in _LOCAL_HOSTS

def _header_value(info, name):
   """Return header `name` from info regardless of key case, else None."""
   for key in (name, name.lower(), name.title()):
      try: value = info[key]
      except (KeyError, TypeError): continue
      if value is not None:
         return value
   return None

def _decode_entity(match):
   """Turn one matched &...; reference into its character, if it is valid."""
   entity = match.group(0)
   body = entity[1:-1]
   try:
      if body.startswith(('#x', '#X')):
         return chr(int(body[2:], 16))
      if body.startswith('#'):
         return chr(int(body[1:]))
      return chr(name2codepoint[body])
   except (KeyError, ValueError, OverflowError):
      # Unknown name or out-of-range number: leave the text as written.
      return entity

def gettitle(uri):
   """Return the cleaned-up <title> text of the page at uri, or None.

   Follows up to 25 redirects using web.head(), fetches the body with
   web.get() only when the content type looks like HTML or XHTML, then
   collapses whitespace, truncates to 200 characters and decodes character
   references.

   None is returned when the URI (or any redirect target) points at a
   loopback host, when the request fails with IOError, when the response is
   not HTML, or when no non-empty title is found.

   NOTE(review): the loopback check is by hostname only; it is not a
   complete defence against requests to internal addresses.
   """
   if ':' not in uri:
      uri = 'http://' + uri
   uri = uri.replace('#!', '?_escaped_fragment_=')

   try:
      redirects = 0
      while True:
         # Checked on every hop so a redirect cannot lead to a local host.
         if _is_local_uri(uri):
            return None
         info = web.head(uri)

         # web.head() is handled as returning either a headers mapping
         # (treated as 200) or a [headers, status] list.
         if isinstance(info, list):
            status = str(info[1])
            info = info[0]
         else: status = '200'

         if not status.startswith('3'):
            break
         location = _header_value(info, 'Location')
         if not location:
            return None
         uri = urllib.parse.urljoin(uri, location)

         redirects += 1
         if redirects >= 25:
            return None

      mtype = _header_value(info, 'content-type')
      if not mtype or not (('/html' in mtype) or ('/xhtml' in mtype)):
         return None

      content = web.get(uri)
   except IOError:
      return None

   # The title pattern is a str pattern, so decode a bytes body first.
   # ISO-8859-1 maps every byte, which makes it a safe last resort.
   if isinstance(content, bytes):
      try: content = content.decode('utf-8')
      except UnicodeDecodeError: content = content.decode('iso-8859-1')
   if not isinstance(content, str):
      return None

   m = r_title.search(content)
   if not m:
      return None

   title = m.group(1).strip()
   title = re.sub(r'[\t\r\n]', ' ', title)
   title = re.sub(r' {2,}', ' ', title)
   if len(title) > 200:
      title = title[:200] + '[...]'

   title = r_entity.sub(_decode_entity, title)
   # A decoded reference such as &#10; can reintroduce a line break.
   title = title.replace('\n', '').replace('\r', '')
   return title or None

# When run as a script rather than imported, print the module's __doc__ text.
if __name__ == '__main__':
   print(__doc__.strip())