#coding:utf-8
import re,os,shutil,sys
import urllib2,socket,cookielib
from threading import Thread,stack_size,Lock
from Queue import Queue
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(
                fileobj=StringIO(resp.read()),
                mode="r"
            )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO( deflate(resp.read()) )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)  # wrap the inflated stream back into a response object
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
    try:             # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)

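# A minimal sketch of using the handler on its own (hypothetical URL);
# build_opener() chains it with the default handlers, exactly as Fetcher does below:
#   opener = urllib2.build_opener(ContentEncodingProcessor())
#   html = opener.open('http://www.example.com/').read()
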
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self,timeout=10,threads=None,stacksize=32768*16,loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support,encoding_support,urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.hsbc.com')
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads      # keep even when None, push() checks it
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()   # wait until every queued request has been processed
        self.q_ans.join()   # and every answer has been popped

    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req,repeat=3):
        if not self.threads:
            print 'no thread, fetching directly with get() instead'
            return self.get(req,repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True,timeout=10)
            self.q_ans.task_done()
        except:
            data = ['','']
        return data

    def threadget(self):
        # worker loop: take a request off the queue, fetch it,
        # and put the (request, html) pair on the answer queue
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got',req
            self.q_ans.put((req,ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1) # don't spam

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception as what:
            print what
            return False

    def get(self,req,repeat=3):
        '''
        http GET req, retrying up to `repeat` times on failure;
        the html text is returned on success, '' on failure
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception as what:
            print what,req
            if repeat>0:
                return self.get(req,repeat-1)
            else:
                print 'GET Failed',req
                return ''
        return data

    def post(self,req,repeat=3):
        '''
        http POST req, retrying up to `repeat` times on failure;
        the response text (or True if it is empty) is returned on success, False on failure
        '''
        if not isinstance(req,urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req,repeat)
            if r:
                return r
            else:
                return True

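# A minimal sketch of a login hook (hypothetical URL and form fields): the callable
# receives Fetcher's opener, performs whatever request establishes the session
# cookies (kept by the HTTPCookieProcessor above), and returns the opener.
#   import urllib
#   def login(opener):
#       opener.open('http://www.example.com/login',
#                   urllib.urlencode({'user':'me','passwd':'secret'}))
#       return opener
#   f = Fetcher(threads=10, loginfunc=login)
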
class SiteCopyer:
    def __init__(self,url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]   # host part, used as the local directory name
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)   # start from a clean copy
        except Exception as what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception as what:
            print what

    def full_link(self,link,baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?',1)[0]   # drop the query string
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/',3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/',2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/',1)[0]+'/'+link
        return link
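
    # Worked examples (hypothetical URLs), with baseurl 'http://www.example.com/a/b.html':
    #   full_link('/css/site.css')    -> 'http://www.example.com/css/site.css'
    #   full_link('../img/logo.png')  -> 'http://www.example.com/img/logo.png'
    #   full_link('js/main.js?v=2')   -> 'http://www.example.com/a/js/main.js'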

    def link_alias(self,link):
        link = self.full_link(link)
        name = link.rsplit('/',1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/'+name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/'+name
        else:
            alias = '/media/image/'+name
        return alias
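
    # Worked examples (hypothetical names):
    #   link_alias('style.css?v=3') -> '/media/css/style.css'
    #   link_alias('logo.png')      -> '/media/image/logo.png'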

    def strip_link(self,link):
        if link and (link[0] in ['"',"'"]):
            link = link[1:]
        while link and (link[-1] in ['"',"'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ["<","'",'"']) and ('feed' not in link):
            return link
        else:
            return ''
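
    # Worked examples (hypothetical values):
    #   strip_link('"/media/main.js"')         -> '/media/main.js'
    #   strip_link("'http://a.example/feed/'") -> ''   (feed links are skipped)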

    def copy(self):
        page = self.f.get(self.baseurl)
        # collect every <link href=...>, <script src=...> and <img src=...> reference
        links = re.compile(r'<link[^>]*href=(.*?)[ >]',re.I).findall(page)
        links.extend( re.compile(r'<script[^>]*src=(.*?)[ >]',re.I).findall(page) )
        links.extend( re.compile(r'<img[^>]*src=(.*?)[ >]',re.I).findall(page) )
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        # rewrite the references to local media/ paths and queue the downloads
        for link in set(links):
            page = page.replace(link,self.link_alias(link)[1:])
            self.f.push( self.full_link(link) )
        open(self.home+'/index.html','w').write(page)
        while self.f.taskleft():
            url,page = self.f.pop()
            if url.endswith('.css'):
                # css files may pull in further assets via url(...)
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push( self.full_link(link,url) )
                    page = page.replace(link,self.link_alias(link)[1:].replace("media",".."))
            print 'write to',self.home+self.link_alias(url)
            try:
                open(self.home+self.link_alias(url),'w').write(page)
            except Exception as what:
                print what


if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"