xkcd webcomic grabber

  1 #!/usr/bin/env python
  2 # -*- coding: utf-8 -*-
  3 #
  4 #    grabxkcd.py - A small script to keep up to date with all xkcd
  5 #                  webcomics. Downloads all up to the current one
  6 #                  into current folder.
  7 #
  8 #    Copyright (C) 2007 Stefan Hacker
  9 #
 10 #    This program is free software; you can redistribute it and/or modify
 11 #    it under the terms of the GNU General Public License as published by
 12 #    the Free Software Foundation; either version 2 of the License, or
 13 #    (at your option) any later version.
 14 #
 15 #    This program is distributed in the hope that it will be useful,
 16 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 17 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 18 #    GNU General Public License for more details.
 19 #
 20 #    You should have received a copy of the GNU General Public License along
 21 #    with this program; if not, write to the Free Software Foundation, Inc.,
 22 #    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 23 
 24 from urlgrabber import (urlread,
 25                         urlgrab)
 26 import os
 27 
# Directory the comics and the status file are stored in; the empty string
# means the current working directory.
target_dir  = ""
# Name of the status file, relative to target_dir. It holds the comic number
# that update_xkcd() starts downloading from.
status_file = "xkcd.txt"
 32 
 33 def extract_string(string, prestring, poststring):
 34     start = string.index(prestring) + len(prestring)
 35     end = string.index(poststring, start)
 36     return string[start:end]
 37 
 38 def update_xkcd():
 39     print "Load status...",
 40     try:
 41         status = open(target_dir+status_file, "r")
 42         nfrom = int(status.read())
 43         status.close()
 44         print "Done (%d)" % nfrom
 45     except:
 46         print "Failed (1)"
 47         nfrom = 1
 48 
 49     #Get range of comics
 50     print "Getting current comic number..."
 51     try:
 52         page = urlread("http://xkcd.com/")
 53         nto = int(extract_string(page,
 54                        "<h3>Permanent link to this comic: http://xkcd.com/",
 55                        "/</h3>"))
 56         print "Done (%d)" % nto
 57     except:
 58         print "Failed"
 59         nto = int(raw_input("Load upto number: "))
 60 
 61     if nfrom > nto:
 62         print "Nothing to do! Already got all comics!"
 63         return []
 64 
 65     result = grab_xkcd(nfrom, nto)
 66     if result:
 67         #Open status file and save position
 68         print "Save status...",
 69         status = open(target_dir+status_file, "w")
 70         status.write("%d" % result[-1])
 71         status.close()
 72         print "Done (%d)" % result[-1]
 73 
 74     return result    
 75 
 76 
 77 def grab_xkcd(nfrom, nto = None):
 78     i=0
 79     nto = nto or (nfrom + 1)
 80     try:
 81         for i in xrange(nfrom, nto+1):
 82             if i == 404:
 83                 print "Skipping 404 error page!"
 84                 continue
 85             print "Retrieving %d..." % i,
 86             page = urlread("http://xkcd.com/%d/" % i)
 87             image = extract_string(page,
 88                            "<h3>Image URL (for hotlinking/embedding): ",
 89                            "</h3>")
 90             filepath = image[len("http://imgs.xkcd.com/comics/"):]
 91             filepath = "%s%.4d_%s" % (target_dir, i, filepath)
 92             
 93             ipath = urlgrab(image, filepath)
 94             
 95             print "got %s" % ipath
 96     except:
 97         print "Error"
 98         print "An error occured while loading comic number: %d" % i
 99     else:
100         i += 1
101 
102     return range(nfrom, i + 1)
103         
104 
def main():
    """Script entry point: run one update pass via update_xkcd()."""
    update_xkcd()
107     
# Run the update only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
xkcd webcomic grabber

grabxkcd.py (Raw)