# NOTE: This script must be modified before use (e.g. the placeholder URLs, download directory and e-mail address set in file_downloader.__init__).
#!/usr/bin/env python
#download all txt files
import ast
import os
import string
import subprocess
import sys
import urllib
from sgmllib import SGMLParser
class URLLister(SGMLParser):
def reset(self):
SGMLParser.reset(self)
self.urls = []
def start_a(self, attrs):
href = [v for k, v in attrs if k == 'href']
if href:
self.urls.extend(href)
class file_downloader(object):
    """Download numbered ``<id>.txt`` files from a site that needs a session.

    A session token is scraped from the landing page (``self.h``) and
    appended to every download URL (``self.u`` + id).  Files smaller than
    ``self.ws`` bytes are treated as failed downloads: they are deleted and
    their ids are recorded in the bookkeeping file ``self.d``/``self.mfn``
    so they can be retried.
    """

    def __init__(self):
        self.c = ""  # used to hold session string
        self.modu = 150  # refresh the session whenever id % modu == 0
        self.h = "http://www.example.org"  # page the session is scraped from
        self.u = "http://www.example.org/download.php?id="  # download URL prefix
        self.d = "file/"  # dir to download files to
        self.mailer = True  # send mail upon completion
        self.to_email = "user@localhost"  # reports are sent here if mailer is True
        self.ws = 110  # files smaller than this many bytes count as wrong
        self.messagerm = ""  # initial text of the "removed files" report
        self.messagesv = ""  # initial text of the "saved files" report
        self.messageimprm = ""  # initial text of the "impossible removed" report
        self.messageimpsv = ""  # initial text of the "impossible saved" report
        self.mfn = "file_missed"  # bookkeeping file name, stored under self.d
        self.missed = []  # missed list
        self.impossible = []  # impossible list

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _path(self, name):
        """Return the path of *name* inside the download directory."""
        return os.path.join(self.d, name)

    @staticmethod
    def _unique_ids(ids):
        """Return *ids* as strings, duplicates dropped, first-seen order kept."""
        seen = set()
        unique = []
        for item in ids:
            key = str(item)
            if key not in seen:
                seen.add(key)
                unique.append(key)
        return unique

    def _load_missed(self):
        """Return the list of missed ids from the bookkeeping file.

        Falls back to ``self.missed`` when the file does not exist.  A blank
        first line yields an empty list.

        Raises:
            ValueError: if the first line is not a list/tuple literal.
        """
        path = self._path(self.mfn)
        if not os.path.exists(path):
            return self._unique_ids(self.missed)
        with open(path) as handle:
            first_line = handle.readline().strip()
        if not first_line:
            return []
        # literal_eval only accepts Python literals, unlike eval(), so a
        # tampered bookkeeping file cannot execute code.
        stored = ast.literal_eval(first_line)
        if not isinstance(stored, (list, tuple)):
            raise ValueError("%s does not contain a list of ids" % path)
        return self._unique_ids(stored)

    def _save_ids(self, ids):
        """Overwrite the bookkeeping file with the list literal of *ids*."""
        with open(self._path(self.mfn), "w") as handle:
            handle.write(str(list(ids)) + "\n")

    def _discard_if_undersized(self, path):
        """Delete *path* if it is smaller than ``self.ws``; return True if so."""
        if os.path.getsize(path) < int(self.ws):
            os.remove(path)
            return True
        return False

    def _require_session(self):
        """Return a session token or raise RuntimeError if none is found."""
        token = self.session_var()
        if token is None:
            raise RuntimeError(
                "no download.php link with a session token found at " + self.h)
        return token

    def _fetch(self, file_id, session):
        """Download *file_id* using *session*; return the local file path."""
        target = self._path(str(file_id) + ".txt")
        urllib.urlretrieve(self.u + str(file_id) + "&" + session, target)
        return target

    def _send_mail(self, subject, body):
        """Pipe *body* to the ``mail`` command addressed to ``self.to_email``.

        Sending is best effort: if the command cannot be started the failure
        is printed and the run continues.
        """
        payload = body + "\n"
        if not isinstance(payload, bytes):
            payload = payload.encode("utf-8")
        try:
            proc = subprocess.Popen(
                ["mail", "-s", subject, self.to_email], stdin=subprocess.PIPE)
            proc.communicate(payload)
        except OSError as exc:
            print("could not send mail '%s': %s" % (subject, exc))

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def session_var(self):
        """Scrape the session token from the landing page ``self.h``.

        Returns the second ``&``-separated component of the first link whose
        href contains ``download.php`` and has such a component, or ``None``
        when no link qualifies.
        """
        usock = urllib.urlopen(self.h)
        try:
            page = usock.read()
        finally:
            usock.close()
        parser = URLLister()
        parser.feed(page)
        parser.close()
        for url in parser.urls:
            if "download.php" in url:
                parts = url.split("&")
                if len(parts) > 1:
                    return parts[1]
        return None

    def check_wrong_size(self):
        """Delete undersized downloads and record their ids as missed.

        Only files named ``<digits>.txt`` in the download directory are
        examined, so the bookkeeping file itself is never removed.  The id
        (file name without ``.txt``) of every deleted file is added to the
        missed list, which is then written back to the bookkeeping file and
        stored in ``self.missed``.
        """
        missed = self._load_missed()
        for name in sorted(os.listdir(self.d)):
            stem, ext = os.path.splitext(name)
            if ext != ".txt" or not stem.isdigit():
                continue
            path = self._path(name)
            if os.path.getsize(path) < int(self.ws):
                print("removing file " + path)
                os.remove(path)
                if stem not in missed:
                    missed.append(stem)
        self.missed = missed
        self._save_ids(missed)

    def download_files(self, rans, rane):
        """Run the download loop over ids ``rans`` up to (excluding) ``rane``.

        Both bounds are passed through ``int()``, so numeric strings are
        accepted.  Undersized downloads are deleted and added to the missed
        list; afterwards every missed id is retried once with a fresh
        session.  Ids that fail again are appended to ``self.impossible``,
        which is written to the bookkeeping file.  If ``self.mailer`` is
        true, four reports are mailed to ``self.to_email``.

        Raises:
            RuntimeError: if no session token can be scraped.
        """
        modu = int(self.modu)
        messagerm, messagesv = self.messagerm, self.messagesv
        messageimprm, messageimpsv = self.messageimprm, self.messageimpsv

        c = self._require_session()
        missed = self._load_missed()

        for i in range(int(rans), int(rane)):
            if i % modu == 0:
                # Refresh the session periodically.
                c = self._require_session()
                print("%s %s" % (i, c))
            target = self._fetch(i, c)
            if self._discard_if_undersized(target):
                if str(i) not in missed:
                    missed.append(str(i))
                messagerm += target + " "
            else:
                messagesv += target + " "
        self.missed = missed

        # Retry missed files; whatever still fails is deemed impossible.
        impossible = self.impossible
        for file_id in missed:
            c = self._require_session()
            target = self._fetch(file_id, c)
            if self._discard_if_undersized(target):
                impossible.append(str(file_id))
                messageimprm += target + " "
            else:
                messageimpsv += target + " "
        self._save_ids(impossible)

        if self.mailer:
            self._send_mail("file removed", messagerm)
            self._send_mail("file saved", messagesv)
            self._send_mail("file impossible removed", messageimprm)
            self._send_mail("file impossible saved", messageimpsv)
if __name__ == "__main__":
    # Usage:
    #   no arguments           -> sweep the download dir for wrong-sized files
    #   <range_start> <range_end> -> download that range of ids
    a = file_downloader()
    if len(sys.argv) < 2:
        a.check_wrong_size()
    elif len(sys.argv) < 3:
        # A lone start value is not enough to define a range.
        sys.exit("usage: %s [range_start range_end]" % sys.argv[0])
    else:
        # sys.argv is a list: it must be indexed, not called.
        rans = sys.argv[1]  # range start, e.g. 250
        rane = sys.argv[2]  # range end, e.g. 432
        a.download_files(rans, rane)
No comments:
Post a Comment