#!/usr/bin/env python

#http://web.archive.org/web/20060821103537/www.philoticweb.net/
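#
# Mirrors a site out of the Internet Archive's Way-Back Machine. Starting
# from the capture of basepath requested at the 14-digit timestamp, it
# follows every link that stays under basepath, saves each page beneath
# outdir, and strips the wrapper markup archive.org injects into the pages
# it serves.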

import sys, urllib
import re
import os
import os.path
import urlparse
import shutil

if len(sys.argv) != 4:
    print "Usage archiveorg_sitebackup.py outdir basepath lasttime"
    exit(1)

outdir = sys.argv[1]
basepath = sys.argv[2]
timestamp = int(sys.argv[3])

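# Downloads that come back as archive.org error pages are parked here so they
# can be retried later.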
laterdir = os.path.join(outdir, "trylater")

try:
    os.makedirs(outdir)
except OSError:
    if not os.path.isdir(outdir):
        print sys.exc_info()
        sys.exc_clear()
        print "Aborting at %s" % outdir
        sys.exit(1)

try:
    os.mkdir(laterdir)
except OSError:
    if not os.path.isdir(laterdir):
        print sys.exc_info()
        sys.exc_clear()
        print "Aborting at %s" % laterdir
        sys.exit(1)
        

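# urlstack:   archive URLs still waiting to be fetched (processed LIFO)
# ignorelist: archive URLs that failed or were missing, never re-queued
# datelookup: maps a local path to the capture date of the copy we already have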
urlstack = []
ignorelist = []
datelookup = {}
urlexp = re.compile(r'http://web.archive.org/web/([0-9]{14})/(.*)')
# Link-bearing HTML attributes and CSS url('...') references; the greedy (.*)
# capture is trimmed back to a bare URL when each match is processed below.
fullurl = re.compile(r'(src|href|codebase|data|archive|background|action|url)(\(\'|\s*=\s*\")(.*)[\'\"]', re.IGNORECASE)

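# Boilerplate archive.org adds to the pages it serves: a <SCRIPT> block
# injected after </BODY>, and a rewritten <BASE HREF> pointing at its
# way_back_stub.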
jsgarbageexp = re.compile(r"</BODY>\s*(<SCRIPT.*</SCRIPT>)", re.IGNORECASE | re.DOTALL)
basegarbageexp = re.compile(r"<BASE\s+HREF=\".+way_back_stub.+\"\s*>", re.IGNORECASE)

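# Build the Way-Back Machine URL for a page as of a 14-digit timestamp.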
def makearchiveurl(timestamp, basepath):
    return "http://web.archive.org/web/%d/%s" % (timestamp, basepath)

def mediafile(name):
    # Extensions of binary/media files whose contents we won't scan for links.
    return name.split('.')[-1].lower() in ['gif', 'jpeg', 'jpg', 'bmp', 'tiff', 'tif', 'png', 'mov', 'avi', 'divx', 'tgz', 'tar', 'rar', 'zip', 'gz', 'bz2', '7z', 'mp3', 'ogg', 'wav', 'mp4', 'mpeg', 'mpg', 'flv', 'swf', 'sit', 'sitx']

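# Pop one archive URL off the stack, save its contents under outdir, scrub
# the archive.org boilerplate out of it, and queue any links it contains that
# stay under basepath. Returns the remaining stack depth, or None when the
# stack is empty.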
def nexturl():
    global urlstack, outdir, basepath, timestamp, datelookup, urlexp, fullurl
    if not len(urlstack):
        return None
    graburl = urlstack.pop()
    date, name = urlexp.findall(graburl)[0]
    if name[:7].lower() == "http://":
        name = name[7:]
    if os.path.basename(name) == "":
        name = os.path.join(name, "index.html")
        
    if name in datelookup and (int(date) <= datelookup[name] or int(date) >= timestamp):
        print "Skipping %s" % name #don't process stuff we've already seen, or that's too recent
    else:
        print "Processing %s" % name
        datelookup[name] = int(date)
        print "outdir %s" % outdir
        print "subdir %s" % os.path.dirname(name)
        fulldir = os.path.join(outdir, os.path.dirname(name))
        print "Fulldir %s" % fulldir
        try:
            os.makedirs(fulldir)
            print "created dir"
        except OSError:
            if not os.path.isdir(fulldir):
                print sys.exc_info()
                sys.exc_clear()
                print "Aborting at %s" % name
                sys.exit(1)
        if os.path.isdir(os.path.join(outdir, name)):
            name = "%s/index.html" % name
        if os.path.exists(os.path.join(outdir, name)):
            print "%s already exists" % os.path.join(outdir, name)
            lhandle = open(os.path.join(outdir, name), 'r')
            data = lhandle.read()
            lhandle.close()
        else:        
            print "writing %s from %s" % (os.path.join(outdir, name), graburl)
            conn = urllib.urlopen(graburl)
            data = conn.read()
            conn.close()
            
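        # archive.org reports failures inside the body text, so sniff for them.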
        if "No archived versions of the page you requested are available" not in data:
            if "Your request failed to connect to our servers" in data or "We were unable to retrieve the requested data" in data:
                newpath = os.path.join(laterdir, os.path.basename(name))
                if os.path.exists(os.path.join(outdir, name)):
                    shutil.move(os.path.join(outdir, name), newpath)
                else:
                    if not os.path.exists(newpath):
                        lhandle = open(newpath, 'w')
                        lhandle.write(data)
                        lhandle.close()
                ignorelist.append(graburl)
                return len(urlstack)
            else:
                lhandle = open(os.path.join(outdir, name), 'w')
                lhandle.write(data)
                lhandle.close()
        else:
            print "archive.org Way-Back Machine was missing %s, so we're ignoring it in our search" % name
            if os.path.isfile(os.path.join(outdir, name)):
                print "oops, it was already written to disk, so we're deleting it too"
                os.remove(os.path.join(outdir, name))
            ignorelist.append(graburl)
            return len(urlstack)
            
        #strip the boilerplate archive.org injected into the page
        jsgarbage = jsgarbageexp.findall(data)
        if len(jsgarbage):
            data = data.replace(jsgarbage[0], "")
        basegarbage = basegarbageexp.findall(data)
        if len(basegarbage):
            data = data.replace(basegarbage[0], "")
        lhandle = open(os.path.join(outdir, name), 'w')
        lhandle.write(data)
        lhandle.close()
        
        if not mediafile(os.path.basename(name)):
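            # findall() yields (attribute, delimiter, url) tuples; transposing
            # with zip() lets us keep just the url column.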
            dirtyurls = zip(*fullurl.findall(data))
            if len(dirtyurls):
                dirtyurls = dirtyurls[-1]
                for url in dirtyurls:
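                    # Trim the greedy capture back to a bare URL, drop any
                    # query string, and resolve it against the current page.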
                    url = url.split('"')[0]
                    url = url.split('?')[0]
                    url = urlparse.urljoin("http://%s" % name, url)
                    if "way_back_stub" in url.lower():
                        continue #skip archive.org's internal stub links
                    elif basepath in url:
                        if not len(urlexp.findall(url)):
                            url = makearchiveurl(timestamp, url)
                    else:
                        print "skipping %s" % url
                        continue
                    if (url not in urlstack) and (url not in ignorelist):
                        urlstack.append(url) 
    return len(urlstack)
        
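# Seed the crawl with the starting page, then drain the stack.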
urlstack.append(makearchiveurl(timestamp, basepath))
print urlstack
while nexturl():
    print "%d in url stack" % len(urlstack)

                
sys.exit(0)