# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
# from selenium import webdriver
import subprocess as sp
from lxml import etree
import requests
import random
import re
#requests的Session可以自动保持cookie,不需要自己维护cookie内容
# requests.Session keeps cookies across requests automatically —
# no manual cookie bookkeeping needed.
S = requests.Session()

# Listing page of free high-anonymity proxies (66ip.cn, area 15, page 1).
target_url = 'http://www.66ip.cn/areaindex_15/1.html'

# Browser-like request headers so the site serves the normal page
# instead of blocking the scraper.
target_headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.66ip.cn/areaindex_15/1.html',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

# Fetch the page and decode it as UTF-8.
target_response = S.get(url=target_url, headers=target_headers)
target_response.encoding = 'utf-8'
target_html = target_response.text

# Parse the page, then re-parse just the fragment with id="footer"
# and grab the children of the <table> inside it.
bf1_ip_list = BeautifulSoup(target_html, 'lxml')
footer_fragment = str(bf1_ip_list.find_all(id='footer'))
bf2_ip_list = BeautifulSoup(footer_fragment, 'lxml')
ip_list_info = bf2_ip_list.table.contents
# 存储代理的列表
# Collected proxy URLs ('http://ip:port') scraped from the table rows.
proxys_list = []
# table.contents alternates whitespace text nodes (even indices) and <tr>
# tags (odd indices); index 1 is presumably the header row — skip both.
for index, row in enumerate(ip_list_info):
    if index % 2 != 1 or index == 1:
        continue
    dom = etree.HTML(str(row))
    ip = dom.xpath('//td[1]')
    port = dom.xpath('//td[2]')
    # Guard against malformed rows with missing cells (the original
    # indexed [0] unconditionally and would raise IndexError).
    if ip and port:
        # Bug fix: the original joined with '#' ('http://#ip#port'), which is
        # not a usable proxy URL; requests' proxies= expects 'http://ip:port'.
        proxys_list.append('http://' + ip[0].text + ':' + port[0].text)
print(proxys_list)
# proxys_list now holds the scraped proxy addresses.
# (Removed: non-code residue accidentally captured from the blog page this
#  script was copied from — article title, publish date "2023-10-20 20:23:01",
#  view counter, and comment-section chrome. None of it was Python.)