1 #!/usr/bin/env python
# -*- coding: utf-8 -*-
3 #
4 # grabxkcd.py - A small script to keep up to date with all xkcd
5 # webcomics. Downloads all up to the current one
6 # into current folder.
7 #
8 # Copyright (C) 2007 Stefan Hacker
9 #
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License as published by
12 # the Free Software Foundation; either version 2 of the License, or
13 # (at your option) any later version.
14 #
15 # This program is distributed in the hope that it will be useful,
16 # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 # GNU General Public License for more details.
19 #
20 # You should have received a copy of the GNU General Public License along
21 # with this program; if not, write to the Free Software Foundation, Inc.,
22 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23
24 from urlgrabber import (urlread,
25 urlgrab)
26 import os
27
#Directory prefix for the status file and the downloaded images. It is
#prepended to file names as-is, so a non-empty value has to end with a
#path separator.
target_dir = ""
#Name of the status file (appended directly to target_dir)
status_file = "xkcd.txt"
32
def extract_string(string, prestring, poststring):
    """Return the part of string enclosed by prestring and poststring.

    The first occurrence of prestring is used, and poststring is searched
    for only after the end of that occurrence. Neither marker is included
    in the result.

    Raises ValueError if either marker cannot be found.
    """
    begin = string.index(prestring)
    begin += len(prestring)
    #Look for the closing marker behind the opening one only
    finish = string.index(poststring, begin)
    return string[begin:finish]
37
38 def update_xkcd():
39 print "Load status...",
40 try:
41 status = open(target_dir+status_file, "r")
42 nfrom = int(status.read())
43 status.close()
44 print "Done (%d)" % nfrom
45 except:
46 print "Failed (1)"
47 nfrom = 1
48
49 #Get range of comics
50 print "Getting current comic number..."
51 try:
52 page = urlread("http://xkcd.com/")
53 nto = int(extract_string(page,
54 "<h3>Permanent link to this comic: http://xkcd.com/",
55 "/</h3>"))
56 print "Done (%d)" % nto
57 except:
58 print "Failed"
59 nto = int(raw_input("Load upto number: "))
60
61 if nfrom > nto:
62 print "Nothing to do! Already got all comics!"
63 return []
64
65 result = grab_xkcd(nfrom, nto)
66 if result:
67 #Open status file and save position
68 print "Save status...",
69 status = open(target_dir+status_file, "w")
70 status.write("%d" % result[-1])
71 status.close()
72 print "Done (%d)" % result[-1]
73
74 return result
75
76
77 def grab_xkcd(nfrom, nto = None):
78 i=0
79 nto = nto or (nfrom + 1)
80 try:
81 for i in xrange(nfrom, nto+1):
82 if i == 404:
83 print "Skipping 404 error page!"
84 continue
85 print "Retrieving %d..." % i,
86 page = urlread("http://xkcd.com/%d/" % i)
87 image = extract_string(page,
88 "<h3>Image URL (for hotlinking/embedding): ",
89 "</h3>")
90 filepath = image[len("http://imgs.xkcd.com/comics/"):]
91 filepath = "%s%.4d_%s" % (target_dir, i, filepath)
92
93 ipath = urlgrab(image, filepath)
94
95 print "got %s" % ipath
96 except:
97 print "Error"
98 print "An error occured while loading comic number: %d" % i
99 else:
100 i += 1
101
102 return range(nfrom, i + 1)
103
104
def main():
    """Script entry point: run one update; its result list is discarded."""
    update_xkcd()
107
#Run the update only when executed as a script, not when imported
if __name__ == "__main__":
    main()