import urllib2, urlparse, html5lib, lxml, sys, os
from lxml.cssselect import CSSSelector
# User-Agent header value sent with every request made by the crawl loop.
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
# Starting point of the crawl.
url = "http://automatist.org"
# Sections of the site (trac, wiki, blog) that the crawl loop's link filter
# checks candidate links against before queueing them.
exclude = "http://automatist.org/trac/ http://automatist.org/wiki/ http://automatist.org/blog/".split()
# Stack of URLs still to fetch; the loop pops from the end of the list.
nexturls = [url]
# URLs already handled, keyed by URL; a link found in this dict is not queued.
visited = {}
# Only links that start with this prefix are followed. Captured here because
# `url` is rebound on every iteration of the loop.
base = url
while nexturls:
url = nexturls.pop()
print url
try:
request = urllib2.Request(url, None, {'User-Agent': useragent})
f=urllib2.urlopen(request)
visited[url] = True
info = f.info()
content_type = info.get("Content-Type")
if content_type.startswith("text/html"):
parser = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
page = parser.parse(f)
for elt in CSSSelector('a[href],img[src]')(page):
try:
href = urlparse.urljoin(f.geturl(), elt.attrib['href'])
except KeyError:
href = urlparse.urljoin(f.geturl(), elt.attrib['src'])
if href.startswith("http") and href.startswith(base) and href not in visited:
if not [x for x in exclude if x.startswith(href)]:
nexturls.append(href)
elif content_type in ("image/jpeg", "image/png", "image/gif"):
print "\tDownloading"
try:
os.mkdir("dump")
except OSError:
pass
o = open('dump/'+url.split('/')[-1], "wb")
o.write(f.read())
o.close()
# Handle the many things that can go wrong in attempting to open a URL
except urllib2.HTTPError, e:
print "\t", e
except ValueError, e:
print "\t", e