# Image spider
#
# From ActiveArchives
#
# (wiki page chrome: "Jump to: navigation, search")
import urllib2, urlparse, html5lib, lxml, sys, os
from lxml.cssselect import CSSSelector
 
# Identification string sent as the User-Agent header of every request.
useragent = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"

# Starting point of the crawl; `base` is the prefix a link must start with
# before the crawl loop will consider following it.
url = "http://automatist.org"
base = url

# Site sections listed as exclusions; consulted by the crawl loop when it
# decides whether to queue a link.
exclude = [
    "http://automatist.org/trac/",
    "http://automatist.org/wiki/",
    "http://automatist.org/blog/",
]

# Stack of URLs still to be fetched, and a record of the URLs already fetched.
nexturls = [url]
visited = {}
 
# Crawl loop.
#
# Pops URLs off `nexturls` (a stack, so the walk is depth-first), fetches each
# one with the configured User-Agent and then either
#   * parses HTML pages and queues every <a href> / <img src> target that lies
#     under `base` and outside the `exclude` prefixes, or
#   * saves JPEG/PNG/GIF responses into the local "dump" directory.
# Fetch errors are reported on stdout and the crawl carries on.

# The selector and the accepted image types never change, so build them once.
select_links = CSSSelector('a[href],img[src]')
image_types = ("image/jpeg", "image/png", "image/gif")

while nexturls:
    url = nexturls.pop()

    # A link can be queued several times before its first fetch; skip repeats.
    if url in visited:
        continue
    # Mark before fetching so that a URL which fails is not retried every
    # time another page links to it.
    visited[url] = True
    print(url)

    try:
        request = urllib2.Request(url, None, {'User-Agent': useragent})
        f = urllib2.urlopen(request)
        try:
            # After redirects the final address may differ from `url`;
            # remember it too so it is not fetched a second time.
            page_url = f.geturl()
            visited[page_url] = True

            # The header may be missing, and may carry parameters such as
            # "; charset=utf-8" -- reduce it to the bare media type.
            content_type = f.info().get("Content-Type") or ""
            media_type = content_type.split(";")[0].strip().lower()

            if media_type.startswith("text/html"):
                parser = html5lib.HTMLParser(
                    tree=html5lib.treebuilders.getTreeBuilder("lxml"),
                    namespaceHTMLElements=False)
                page = parser.parse(f)

                for elt in select_links(page):
                    # <a> elements carry the target in href, <img> in src.
                    link = elt.attrib.get('href')
                    if link is None:
                        link = elt.attrib['src']

                    # Resolve relative links and drop the #fragment, which
                    # names a position inside a page rather than a new page.
                    href = urlparse.urldefrag(urlparse.urljoin(page_url, link))[0]

                    if not href.startswith(base) or href in visited:
                        continue
                    # Skip anything that lives under an excluded prefix.
                    if any(href.startswith(prefix) for prefix in exclude):
                        continue
                    nexturls.append(href)

            elif media_type in image_types:
                print("\tDownloading")
                try:
                    os.mkdir("dump")
                except OSError:
                    # Most likely the directory already exists; if it could
                    # not be created at all, the open() below will fail.
                    pass

                # Name the file after the last path segment, ignoring any
                # query string.  NOTE: images that share a file name in
                # different directories still overwrite each other.
                filename = urlparse.urlsplit(url).path.split('/')[-1] or "unnamed"
                with open(os.path.join("dump", filename), "wb") as o:
                    o.write(f.read())
        finally:
            f.close()

    # Handle the many things that can go wrong in attempting to open a URL.
    # URLError covers HTTPError (its subclass) as well as DNS and connection
    # failures; ValueError is raised for malformed URLs.
    except (urllib2.URLError, ValueError) as e:
        print("\t%s" % (e,))
# (wiki page chrome follows)
# Personal tools
# Namespaces
# Variants
# Actions
# Navigation
# Toolbox