Example: scraping a proxy IP list
Key points
- beautifulsoup4
- urllib & urllib2 & httplib2
- HTTP requests
- file I/O
#encoding=utf-8
# from fib import fibonacci
# This statement does not import the whole fib module into the current namespace;
# it only binds the single name fibonacci from fib into the global symbol table
# of the module executing the import.
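# A minimal sketch of the difference (fib is a hypothetical module here):
#   import fib                  # names stay under the module: fib.fibonacci(10)
#   from fib import fibonacci   # only fibonacci is bound: fibonacci(10);
#                               # the name fib itself is NOT defined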
from bs4 import BeautifulSoup
import urllib2
import urllib
# Steps to install beautifulsoup4 from source
# curl http://www.crummy.com/software/BeautifulSoup/bs4/download/4.1/beautifulsoup4-4.1.2.tar.gz > beautifulsoup4-4.1.2.tar.gz
# tar zxvf beautifulsoup4-4.1.2.tar.gz
# cd beautifulsoup4-4.1.2
# python setup.py install
# (or simply: pip install beautifulsoup4)
# Differences between urllib2 and urllib (http://blog.youkuaiyun.com/dolphin_h/article/details/45296353)
# urllib2 accepts a Request instance, which lets you set the request headers;
# urllib only accepts a bare URL, so with urllib you cannot spoof things like the User-Agent string.
# url = 'http://www.someserver.com/cgi-bin/register.cgi'
# user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' # goes into the request headers below
# values = {'name' : 'who','password':'123456'}
# headers = { 'User-Agent' : user_agent }
# data = urllib.urlencode(values) #{'wd':'D_in'} => wd=D_in
# req = urllib2.Request(url, data, headers)
# response = urllib2.urlopen(req)
# the_page = response.read()
# url = r'http://www.renren.com/ajaxLogin'
#
# # create a cookie container cj (requires: import cookielib)
# cj = cookielib.CookieJar()
# opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# # encode the data to be POSTed; note 'pass' is a Python keyword, so call the variable 'password'
# data = urllib.urlencode({"email": email, "password": password})
# r = opener.open(url, data)
# print cj
# urllib provides urlencode for building GET query strings, while urllib2 does not;
# this is why urllib is so often used together with urllib2.
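# A minimal GET sketch combining the two (example.com is a placeholder host):
#   params = urllib.urlencode({'wd': 'D_in', 'page': 1})  # => 'wd=D_in&page=1'
#   resp = urllib2.urlopen('http://www.example.com/s?' + params)
#   print resp.getcode()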
# If you only do HTTP work, take a look at httplib2, which is friendlier than the
# other modules. (Note: the example below actually uses the stdlib httplib.)
# import httplib
# def sendhttp():
# data = urllib.urlencode({'@number': 12524, '@type': 'issue', '@action': 'show'})
# headers = {"Content-type": "application/x-www-form-urlencoded",
# "Accept": "text/plain"}
# conn = httplib.HTTPConnection('bugs.python.org')
# conn.request('POST', '/', data, headers)
# httpres = conn.getresponse()
# print httpres.status
# print httpres.reason
# print httpres.read()
#
# if __name__ == '__main__':
# sendhttp()
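# For comparison, roughly the same request with httplib2 (a hedged sketch; httplib2
# is a third-party package, installed e.g. via pip install httplib2):
#   import httplib2
#   h = httplib2.Http()
#   resp, content = h.request('http://bugs.python.org/', 'POST',
#                             body=data, headers=headers)
#   print resp.status
#   print content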
of = open('proxy2.txt', 'w')
for page in range(1, 2):  # only page 1 for now; widen the range to fetch more pages
    print "start page", page
    # Masquerade as a browser, otherwise the site may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url='http://www.xici.net.co/nn/' + str(page), headers=headers)
    resp = urllib2.urlopen(req)
# req = urllib.urlopen('http://www.xici.net.co/nn')
# Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
# User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36
    html_doc = resp.read()
    # If Chinese text in the source looks garbled, try: print html_doc.decode('utf-8')
    print 'response code:', resp.getcode()
    print 'response header:', resp.info()
    print 'resp content:', html_doc
# html_doc = urllib2.urlopen('http://www.xici.net.co/nn/' + str(page)).read()
# print html_doc
    # Sample response (excerpt); the column headers below are, in order:
    # country, IP address, port, location, anonymity, type, speed, connect time, verify time
# <table id="ip_list">
# <tr>
# <th></th>
# <th>国家</th>
# <th>IP地址</th>
# <th>端口</th>
# <th>位置</th>
# <th>是否匿名</th>
# <th>类型</th>
# <th>速度</th>
# <th>连接时间</th>
# <th>验证时间</th>
# </tr>
# ....
# </table>
    # Specify the parser explicitly; bs4 falls back to a guess (with a warning) otherwise
    soup = BeautifulSoup(html_doc, 'html.parser')
    # the table with id 'ip_list'; collect all of its <tr> rows
    trs = soup.find('table', id='ip_list').find_all('tr')
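    # Equivalent with a CSS selector (bs4 also offers soup.select):
    #   trs = soup.select('table#ip_list tr')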
print "trs"
for tr in trs[1:]:
# <tr class="">
# <td></td>
# <td><img alt="Cn" src="http://fs.xicidaili.com/images/flag/cn.png"/></td>
# <td>114.105.216.240</td>
# <td>6675</td>
# <td>
# 安徽亳州
# </td>
# <td>高匿</td>
# <td>socks4/5</td>
# <td>
# <div class="bar" title="1.413秒">
# <div class="bar_inner fast" style="width:86%">
# </div>
# </div>
# </td>
# <td>
# <div class="bar" title="0.412秒">
# <div class="bar_inner fast" style="width:92%">
# </div>
# </div>
# </td>
# <td>13-01-09 15:11</td>
# </tr>
print "----------------------------------------\n", tr
tds = tr.find_all('td')
ip = tds[2].text.strip() # 第2个位ip
port = tds[3].text.strip()
protocol = tds[6].text.strip()
print "=============:\n"
print tds[2].text.strip(), ":", tds[3].text.strip(), " " ,protocol
print "============="
if protocol == 'HTTP' or protocol == 'HTTPS':
of.write('%s=%s:%s\n' % (protocol, ip, port) )
print '%s=%s:%s' % (protocol, ip, port)
of.close()
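
# A hedged sketch of using one of the scraped proxies afterwards; the file written
# above holds one 'PROTOCOL=ip:port' entry per line:
#   line = open('proxy2.txt').readline().strip()   # e.g. 'HTTP=1.2.3.4:8080'
#   protocol, address = line.split('=', 1)
#   opener = urllib2.build_opener(urllib2.ProxyHandler({protocol.lower(): address}))
#   print opener.open('http://www.baidu.com').getcode()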