#coding:utf-8
import re,os,shutil,sys
import urllib2,socket,cookielib
from threading import Thread,stack_size,Lock
from Queue import Queue
import time
from gzip import GzipFile
from StringIO import StringIO

class ContentEncodingProcessor(urllib2.BaseHandler):
    """A handler to add gzip capabilities to urllib2 requests"""

    # add headers to requests
    def http_request(self, req):
        req.add_header("Accept-Encoding", "gzip, deflate")
        return req

    # decode
    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get("content-encoding") == "gzip":
            gz = GzipFile(
                fileobj=StringIO(resp.read()),
                mode="r"
            )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get("content-encoding") == "deflate":
            gz = StringIO( deflate(resp.read()) )
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)  # wrap the inflated stream back into a response object
            resp.msg = old_resp.msg
        return resp

# deflate support
import zlib
def deflate(data):   # zlib only provides the zlib compress format, not the deflate format;
    try:             # so on top of all there's this workaround:
        return zlib.decompress(data, -zlib.MAX_WBITS)
    except zlib.error:
        return zlib.decompress(data)

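# A minimal sketch of using the handler on its own (hypothetical URL);
# build_opener() chains it with the default handlers, exactly as Fetcher does below:
#   opener = urllib2.build_opener(ContentEncodingProcessor())
#   html = opener.open('http://www.example.com/').read()
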
class Fetcher:
    '''
    html Fetcher

    basic usage
    -----------
    from fetcher import Fetcher
    f = Fetcher()
    f.get(url)

    post
    ----
    req = urllib2.Request(...)
    f.post(req)

    multi-thread
    ------------
    f = Fetcher(threads=10)
    for url in urls:
        f.push(url)
    while f.taskleft():
        url,html = f.pop()
        deal_with(url,html)
    '''
    def __init__(self,timeout=10,threads=None,stacksize=32768*16,loginfunc=None):
        #proxy_support = urllib2.ProxyHandler({'http':'http://localhost:3128'})
        cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
        encoding_support = ContentEncodingProcessor()
        #self.opener = urllib2.build_opener(cookie_support,encoding_support,proxy_support,urllib2.HTTPHandler)
        self.opener = urllib2.build_opener(cookie_support,encoding_support,urllib2.HTTPHandler)
        self.req = urllib2.Request('http://www.hsbc.com')
        socket.setdefaulttimeout(timeout)
        self.q_req = Queue()
        self.q_ans = Queue()
        self.lock = Lock()
        self.running = 0
        self.threads = threads      # keep even when None, push() checks it
        if loginfunc:
            self.opener = loginfunc(self.opener)
        if threads:
            stack_size(stacksize)
            for i in range(threads):
                t = Thread(target=self.threadget)
                t.setDaemon(True)
                t.start()

    def __del__(self):
        time.sleep(0.5)
        self.q_req.join()   # wait until every queued request has been processed
        self.q_ans.join()   # and every answer has been popped

    def taskleft(self):
        return self.q_req.qsize()+self.q_ans.qsize()+self.running

    def push(self,req,repeat=3):
        if not self.threads:
            print 'no thread, fetching directly with get() instead'
            return self.get(req,repeat)
        self.q_req.put(req)

    def pop(self):
        try:
            data = self.q_ans.get(block=True,timeout=10)
            self.q_ans.task_done()
        except:
            data = ['','']
        return data

    def threadget(self):
        # worker loop: take a request off the queue, fetch it,
        # and put the (request, html) pair on the answer queue
        while True:
            req = self.q_req.get()
            with self.lock:
                self.running += 1
            ans = self.get(req)
            print 'got',req
            self.q_ans.put((req,ans))
            try:
                self.q_req.task_done()
            except:
                pass
            with self.lock:
                self.running -= 1
            time.sleep(0.1) # don't spam

    def proxyisworking(self):
        try:
            self.opener.open('http://www.hsbc.com').read(1024)
            return True
        except Exception as what:
            print what
            return False

    def get(self,req,repeat=3):
        '''
        http GET req, retrying up to `repeat` times on failure;
        the html text is returned on success, '' on failure
        '''
        try:
            response = self.opener.open(req)
            data = response.read()
        except Exception as what:
            print what,req
            if repeat>0:
                return self.get(req,repeat-1)
            else:
                print 'GET Failed',req
                return ''
        return data

    def post(self,req,repeat=3):
        '''
        http POST req, retrying up to `repeat` times on failure;
        the response text (or True if it is empty) is returned on success, False on failure
        '''
        if not isinstance(req,urllib2.Request):
            print 'post method needs a urllib2.Request as argument'
            return False
        else:
            r = self.get(req,repeat)
            if r:
                return r
            else:
                return True

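# A minimal sketch of a login hook (hypothetical URL and form fields): the callable
# receives Fetcher's opener, performs whatever request establishes the session
# cookies (kept by the HTTPCookieProcessor above), and returns the opener.
#   import urllib
#   def login(opener):
#       opener.open('http://www.example.com/login',
#                   urllib.urlencode({'user':'me','passwd':'secret'}))
#       return opener
#   f = Fetcher(threads=10, loginfunc=login)
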
class SiteCopyer:
    def __init__(self,url):
        self.baseurl = url
        self.home = self.baseurl.split('/')[2]   # host part, used as the local directory name
        self.f = Fetcher(threads=10)
        self.create_dir()

    def create_dir(self):
        try:
            shutil.rmtree(self.home)   # start from a clean copy
        except Exception as what:
            print what
        try:
            os.mkdir(self.home)
            os.mkdir(self.home+'/media')
            os.mkdir(self.home+'/media/js')
            os.mkdir(self.home+'/media/css')
            os.mkdir(self.home+'/media/image')
        except Exception as what:
            print what

    def full_link(self,link,baseurl=None):
        if not baseurl:
            baseurl = self.baseurl
        if '?' in link:
            link = link.rsplit('?',1)[0]   # drop the query string
        if not link.startswith('http://'):
            if link.startswith('/'):
                link = '/'.join(baseurl.split('/',3)[:3]) + link
            elif link.startswith('../'):
                while link.startswith('../'):
                    baseurl = baseurl.rsplit('/',2)[0]
                    link = link[3:]
                link = baseurl+'/'+link
            else:
                link = baseurl.rsplit('/',1)[0]+'/'+link
        return link
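
    # Worked examples (hypothetical URLs), with baseurl 'http://www.example.com/a/b.html':
    #   full_link('/css/site.css')    -> 'http://www.example.com/css/site.css'
    #   full_link('../img/logo.png')  -> 'http://www.example.com/img/logo.png'
    #   full_link('js/main.js?v=2')   -> 'http://www.example.com/a/js/main.js'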

    def link_alias(self,link):
        link = self.full_link(link)
        name = link.rsplit('/',1)[1]
        if '.css' in name:
            name = name[:name.find('.css')+4]
            alias = '/media/css/'+name
        elif '.js' in name:
            name = name[:name.find('.js')+3]
            alias = '/media/js/'+name
        else:
            alias = '/media/image/'+name
        return alias
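
    # Worked examples (hypothetical names):
    #   link_alias('style.css?v=3') -> '/media/css/style.css'
    #   link_alias('logo.png')      -> '/media/image/logo.png'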

    def strip_link(self,link):
        if link and (link[0] in ['"',"'"]):
            link = link[1:]
        while link and (link[-1] in ['"',"'"]):
            link = link[:-1]
        while link.endswith('/'):
            link = link[:-1]
        if link and (link[0] not in ["<","'",'"']) and ('feed' not in link):
            return link
        else:
            return ''
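
    # Worked examples (hypothetical values):
    #   strip_link('"/media/main.js"')         -> '/media/main.js'
    #   strip_link("'http://a.example/feed/'") -> ''   (feed links are skipped)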

    def copy(self):
        page = self.f.get(self.baseurl)
        # collect every <link href=...>, <script src=...> and <img src=...> reference
        links = re.compile(r'<link[^>]*href=(.*?)[ >]',re.I).findall(page)
        links.extend( re.compile(r'<script[^>]*src=(.*?)[ >]',re.I).findall(page) )
        links.extend( re.compile(r'<img[^>]*src=(.*?)[ >]',re.I).findall(page) )
        templinks = []
        for link in links:
            slink = self.strip_link(link)
            if slink:
                templinks.append(slink)
        links = templinks
        # rewrite the references to local media/ paths and queue the downloads
        for link in set(links):
            page = page.replace(link,self.link_alias(link)[1:])
            self.f.push( self.full_link(link) )
        open(self.home+'/index.html','w').write(page)
        while self.f.taskleft():
            url,page = self.f.pop()
            if url.endswith('.css'):
                # css files may pull in further assets via url(...)
                links = re.compile(r'url\([\'"]?(.*?)[\'"]?\)').findall(page)
                templinks = []
                for link in links:
                    slink = self.strip_link(link)
                    if slink:
                        templinks.append(slink)
                links = templinks
                for link in set(links):
                    self.f.push( self.full_link(link,url) )
                    page = page.replace(link,self.link_alias(link)[1:].replace("media",".."))
            print 'write to',self.home+self.link_alias(url)
            try:
                open(self.home+self.link_alias(url),'w').write(page)
            except Exception as what:
                print what


if __name__ == "__main__":
    if len(sys.argv) == 2:
        url = sys.argv[1]
        SiteCopyer(url).copy()
    else:
        print "Usage: python "+sys.argv[0]+" url"