Friday, September 19, 2008

Python Incremental Downloader Script

This is a script that incrementally downloads all files on a website. It accumulates the file names and sends an email to user@localhost listing which files were downloaded and kept, which ones were downloaded and then deleted because of a wrong file size, etc. Make modifications where necessary. This is something that worked for me. Most sites don't need the session variable passed via the query string; for the ones that do, this script works great. You may come across sites that require storing a cookie. The most feasible way I can think of to accomplish this is to use pyCurl. You can also add this functionality to a bot that downloads files and zips them after reaching a certain directory space limit, for easy download via DCC (in Python) or something similar. There are a lot of possibilities here. Depending on the site and your connection, you may get on the order of 10GB per day of downloads. Of course there may be a better way to implement the functionality; this is one way to do it. You can add more functions named start_ (one per tag) and do what you will with the data, such as img, form or input fields. Nuff said. Enjoy this script. It's not really mine and you can do what you want with it. No restrictions and all that mumbo jumbo.

NOTE: Modification is required to use this script.


#!/usr/bin/env python
#download all txt files

import ast
import os
import string
import subprocess
import sys
import urllib
from sgmllib import SGMLParser

class URLLister(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.urls = []

def start_a(self, attrs):
href = [v for k, v in attrs if k == 'href']
if href:
self.urls.extend(href)

class file_downloader(object):
    """Download numbered .txt files from a site that needs a session parameter.

    Files are fetched from ``self.u + <id> + "&" + <session>`` into the
    directory ``self.d``.  A download smaller than ``self.ws`` bytes is treated
    as a failed download: it is deleted and its id is remembered so it can be
    retried.  Ids that still fail after the retry are written to the file
    ``self.mfn`` inside ``self.d`` as a Python list literal on one line.
    """

    def __init__(self):
        self.c = ""  # used to hold session string
        self.modu = 150  # perform session check every modulus == 0
        self.h = "http://www.example.org"  # used to get session
        self.u = "http://www.example.org/download.php?id="  # used to get file via sess
        self.d = "file/"  # dir to dl files to
        self.mailer = True  # send mail upon complete
        self.to_email = "user@localhost"  # message are sent here if above True
        self.ws = 110  # wrong file size
        self.messagerm = ""  # used to hold message sent of removed files
        self.messagesv = ""  # used to hold message sent of saved files
        self.messageimprm = ""  # used to hold message sent of impossible removed files
        self.messageimpsv = ""  # used to hold message sent of impossible saved files
        self.mfn = "file_missed"  # impossible files are stored in this file under specified dir
        self.missed = []  # missed list
        self.impossible = []  # impossible list

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _path(self, name):
        """Return the path of ``name`` inside the download directory."""
        return os.path.join(self.d, name)

    def _ensure_dir(self):
        """Create the download directory if it does not exist yet."""
        if not os.path.isdir(self.d):
            os.makedirs(self.d)

    @staticmethod
    def _unique(ids):
        """Return ``ids`` as strings with duplicates dropped, order preserved."""
        seen = set()
        ordered = []
        for item in ids:
            key = str(item)
            if key not in seen:
                seen.add(key)
                ordered.append(key)
        return ordered

    def _load_missed(self):
        """Return the ids stored in the missed file, or ``self.missed`` if there is none.

        The file is parsed with ``ast.literal_eval`` rather than ``eval`` so
        that its contents can never execute code.  An empty or unparsable
        file is reported on stderr and treated as an empty list.
        """
        path = self._path(self.mfn)
        if not os.path.exists(path):
            return self.missed
        with open(path) as handle:
            first_line = handle.readline().strip()
        if not first_line:
            return []
        try:
            stored = ast.literal_eval(first_line)
        except (ValueError, SyntaxError):
            sys.stderr.write("ignoring unreadable id list in " + path + "\n")
            return []
        if not isinstance(stored, (list, tuple)):
            sys.stderr.write("ignoring unexpected content in " + path + "\n")
            return []
        return self._unique(stored)

    def _save_ids(self, ids):
        """Write ``ids`` to the missed file as a single-line list literal."""
        with open(self._path(self.mfn), "w") as handle:
            handle.write(str(list(ids)) + "\n")

    def _too_small(self, path):
        """Return True when the file at ``path`` is smaller than ``self.ws`` bytes."""
        return os.path.getsize(path) < int(self.ws)

    @staticmethod
    def _remove(path):
        """Delete ``path``; a failure is reported on stderr instead of raised."""
        try:
            os.remove(path)
        except OSError as err:
            sys.stderr.write("could not remove %s: %s\n" % (path, err))

    def _current_session(self):
        """Return a fresh session string or raise RuntimeError if none is found."""
        session = self.session_var()
        if session is None:
            raise RuntimeError(
                "no download.php link with a session parameter found at " + self.h)
        return session

    def _fetch(self, file_id, session):
        """Download one file and return ``(path, kept)``.

        ``kept`` is False when the download was smaller than ``self.ws`` and
        has therefore been deleted again.
        """
        target = self._path(str(file_id) + ".txt")
        urllib.urlretrieve(self.u + str(file_id) + "&" + session, target)
        if self._too_small(target):
            self._remove(target)
            return target, False
        return target, True

    def _send_mail(self, subject, body):
        """Pipe ``body`` to the system ``mail`` command addressed to ``self.to_email``.

        The command is run without a shell so that neither the subject, the
        address nor the body can be interpreted as shell syntax.  A missing
        ``mail`` binary is reported on stderr and otherwise ignored.
        """
        payload = body + "\n"
        if not isinstance(payload, bytes):
            payload = payload.encode("utf-8")
        try:
            proc = subprocess.Popen(["mail", "-s", subject, self.to_email],
                                    stdin=subprocess.PIPE)
            proc.communicate(payload)
        except OSError as err:
            sys.stderr.write("could not send mail '%s': %s\n" % (subject, err))

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def session_var(self):
        """Fetch ``self.h`` and return the session parameter of a download link.

        The first link containing ``download.php`` and at least one ``&`` is
        used; the text between its first and second ``&`` is returned.
        Returns None when the page has no such link.
        """
        usock = urllib.urlopen(self.h)
        try:
            parser = URLLister()
            parser.feed(usock.read())
            parser.close()
        finally:
            # Always release the connection, even if parsing fails.
            usock.close()
        for url in parser.urls:
            if "download.php" not in url:
                continue
            parts = url.split("&")
            if len(parts) > 1:
                return parts[1]
        return None

    def check_wrong_size(self):
        """Delete undersized .txt downloads and record their ids for a later retry.

        Only ``*.txt`` files in ``self.d`` are inspected, so the bookkeeping
        file ``self.mfn`` is never mistaken for a failed download.  The id
        recorded is the file name without its extension, which is the value
        ``download_files`` needs in order to fetch the file again.
        """
        self._ensure_dir()
        missed = self._load_missed()
        for name in sorted(os.listdir(self.d)):
            stem, ext = os.path.splitext(name)
            path = self._path(name)
            if ext != ".txt" or not os.path.isfile(path):
                continue
            if self._too_small(path):
                print("removing file " + path)
                self._remove(path)
                if stem not in missed:
                    missed.append(stem)
        self._save_ids(missed)

    def download_files(self, rans, rane):
        """Download ids ``rans`` (inclusive) to ``rane`` (exclusive), then retry misses.

        The session string is refreshed whenever the id is a multiple of
        ``self.modu``.  After the main pass every missed id (including those
        loaded from the missed file) is retried once with a fresh session;
        ids that fail again are appended to ``self.impossible`` and written
        to the missed file.  When ``self.mailer`` is true, four reports are
        mailed to ``self.to_email``.

        Raises RuntimeError when no session parameter can be obtained.
        """
        self._ensure_dir()
        removed, saved = [], []
        retry_removed, retry_saved = [], []

        session = self._current_session()
        missed = self._load_missed()
        for i in range(int(rans), int(rane)):
            if i % int(self.modu) == 0:
                session = self._current_session()
            print("%s %s" % (i, session))
            target, kept = self._fetch(i, session)
            if kept:
                saved.append(target)
            else:
                missed.append(str(i))
                removed.append(target)

        # retry impossible & missed files; de-duplicate so an id that was
        # already listed and failed again is only fetched once more
        for file_id in self._unique(missed):
            session = self._current_session()
            target, kept = self._fetch(file_id, session)
            if kept:
                retry_saved.append(target)
            else:
                self.impossible.append(str(file_id))
                retry_removed.append(target)
        self._save_ids(self.impossible)

        if self.mailer:
            self._send_mail("file removed",
                            self.messagerm + " ".join(removed))
            self._send_mail("file saved",
                            self.messagesv + " ".join(saved))
            self._send_mail("file impossible removed",
                            self.messageimprm + " ".join(retry_removed))
            self._send_mail("file impossible saved",
                            self.messageimpsv + " ".join(retry_saved))
if __name__ == "__main__":
    # Without arguments only the clean-up pass runs; with a start and an end
    # id the range [start, end) is downloaded.
    a = file_downloader()
    args = sys.argv[1:]
    if not args:
        a.check_wrong_size()
    elif len(args) == 1:
        # A lone start id has no matching end; refuse instead of crashing.
        sys.exit("usage: %s [range_start range_end]" % sys.argv[0])
    else:
        # sys.argv is a list and must be indexed, not called.
        rans = args[0]  # range start 250
        rane = args[1]  # range end 432
        a.download_files(rans, rane)

No comments: