python get header repsonse high performance

原创于 2011-08-29 19:03:36 发布 · 108 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python

本文介绍了一个脚本，使用多线程技术并利用HTTP头部请求来批量检查网页链接的状态码，确保链接可用性。


from urlparse import urlparse
from threading import Thread
import httplib, sys
from Queue import Queue

concurrent = 200

def doWork():
    while True:
        url=q.get()
        status,url=getStatus(url)
        doSomethingWithResult(status,url)
        q.task_done()

def getStatus(ourl):
    try:
        url = urlparse(ourl)
        conn = httplib.HTTPConnection(url.netloc)   
        conn.request("HEAD", url.path)
        res = conn.getresponse()
        return res.status, ourl
    except:
        return "error", ourl

def doSomethingWithResult(status, url):
    print status, url

q=Queue(concurrent*2)
for i in range(concurrent):
    t=Thread(target=doWork)
    t.daemon=True
    t.start()
try:
    for url in open('urllist.txt'):
        q.put(url.strip())
    q.join()
except KeyboardInterrupt:
    sys.exit(1)