Added URI snarfing with automatic title reading

2011-03-05 19:51:52 -06:00
parent 18a24a8117
commit ff2434db41
1 changed files with 34 additions and 24 deletions
@@ -9,6 +9,7 @@ http://inamidst.com/phenny/

 import re, urllib, urllib2, httplib, urlparse, time, cookielib
 from htmlentitydefs import name2codepoint
+from string import join
 import web
 from tools import deprecated

@@ -82,7 +83,32 @@ def f_title(self, origin, match, args):
      uri = self.last_seen_uri.get(origin.sender)
   if not uri: 
      return self.msg(origin.sender, 'I need a URI to give the title of...')
+   title = gettitle(uri)
+   if title:
+      self.msg(origin.sender, origin.nick + ': ' + title)
+   else: self.msg(origin.sender, origin.nick + ': No title found')
+f_title.commands = ['title']

+def noteuri(phenny, input):  # remember the latest URI seen, stored per input.sender on the bot
+   uri = input.group(1).encode('utf-8')  # group 1 is the URI captured by noteuri.rule, encoded as UTF-8
+   if not hasattr(phenny.bot, 'last_seen_uri'):  # create the store the first time a URI is noted
+      phenny.bot.last_seen_uri = {}
+   phenny.bot.last_seen_uri[input.sender] = uri  # overwrites any earlier URI for this sender; f_title reads this mapping when no URI is given
+noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'  # NOTE(review): leading greedy .* selects the last URI in the text, and the class also accepts ',' and '.', so trailing punctuation stays inside group 1
+noteuri.priority = 'low'
+
+titlecommands = r'(?:' + join(f_title.commands, r'|') + r')'  # non-capturing alternation built from the names in f_title.commands
+def snarfuri(phenny, input):  # look up the title of a URI found in a message and post it to input.sender
+   if re.match(r'(?i)' + phenny.config.prefix + titlecommands, input.group()):  # text starts with prefix + a title command; presumably skipped so f_title's reply is not duplicated. NOTE(review): prefix is spliced in unescaped -- assumes it is already a regex, confirm
+      return
+   uri = input.group(1).encode('utf-8')  # group 1 is the URI captured by snarfuri.rule, encoded as UTF-8
+   title = gettitle(uri)  # gettitle hands back None on its failure paths
+   if title:
+      phenny.msg(input.sender, '[ ' + title + ' ]')  # nothing is sent when no title was found
+snarfuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'  # identical pattern to noteuri.rule
+snarfuri.priority = 'low'
+
+def gettitle(uri):
   if not ':' in uri: 
      uri = 'http://' + uri
   uri = uri.replace('#!', '?_escaped_fragment_=')
@@ -98,7 +124,6 @@ def f_title(self, origin, match, args):
         u = urllib2.urlopen(req)
         info = u.info()
         u.close()
-         # info = web.head(uri)

         if not isinstance(info, list): 
            status = '200'
@@ -111,23 +136,19 @@ def f_title(self, origin, match, args):

         redirects += 1
         if redirects >= 25: 
-            self.msg(origin.sender, origin.nick + ": Too many redirects")
-            return
+            return None

      try: mtype = info['content-type']
      except: 
-         err = ": Couldn't get the Content-Type, sorry"
-         return self.msg(origin.sender, origin.nick + err)
+         return None
         if not (('/html' in mtype) or ('/xhtml' in mtype)): 
-         self.msg(origin.sender, origin.nick + ": Document isn't HTML")
-         return
+            return None

      u = urllib2.urlopen(req)
      bytes = u.read(262144)
      u.close()

   except IOError: 
-      self.msg(origin.sender, "Can't connect to %s" % uri)
      return

   m = r_title.search(bytes)
@@ -161,21 +182,10 @@ def f_title(self, origin, match, args):
            try: title = title.decode('iso-8859-1').encode('utf-8')
            except: title = title.decode('cp1252').encode('utf-8')
         else: pass
-      else: title = '[The title is empty.]'
-
         title = title.replace('\n', '')
         title = title.replace('\r', '')
-      self.msg(origin.sender, origin.nick + ': ' + title)
-   else: self.msg(origin.sender, origin.nick + ': No title found')
-f_title.commands = ['title']
-
-def noteuri(phenny, input): 
-   uri = input.group(1).encode('utf-8')
-   if not hasattr(phenny.bot, 'last_seen_uri'): 
-      phenny.bot.last_seen_uri = {}
-   phenny.bot.last_seen_uri[input.sender] = uri
-noteuri.rule = r'.*(http[s]?://[^<> "\x01]+)[,.]?'
-noteuri.priority = 'low'
+      else: title = None
+   return title

 if __name__ == '__main__':  # true only when the module is run as a script
   print __doc__.strip()  # Python 2 print statement: emit the module docstring without surrounding whitespace