import requests
from bs4 import BeautifulSoup
# Target URL and request headers.
# The proxies listed on this site are all outside the firewall; the catch is
# that reaching the target site in the first place requires getting out too.
url = 'https://www.sslproxies.org/'
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
# Fetch the page
def get_raw_data(url, headers, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The response body decoded with ``bytes.decode()``'s default (UTF-8).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    # Without a timeout, requests may block forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # HTML parser as if it were the proxy table.
    response.raise_for_status()
    return response.content.decode()
# Parse with BeautifulSoup
def bs_data(raw_data):
    """Build a BeautifulSoup tree from ``raw_data`` using the 'lxml' feature.

    Args:
        raw_data: Markup to parse.

    Returns:
        The ``BeautifulSoup`` object created from ``raw_data``.
    """
    return BeautifulSoup(raw_data, features='lxml')
# 提取IP及其地址信息,返回列表
def bs_extract_ip(soup):
ip_data = soup.find_all('td', class_='', colspan='')
ip_list = []
for i in range(0, 400, 4):
ip_raw = ip_data[i].contents[0]
ip_port = ip_data[i+1].contents[0]
ip_address = ip_data[i+2].contents[0]
ip_feature = ip_data[i+3].contents[0]
ip_proxy = '{}:{}'.format(ip_raw, ip_port)
proxies = "{
{'http': 'http://{ip_input}', 'https': 'https://{
爬虫系列(1):IP 代理抓取(Python)
最新推荐文章于 2024-06-11 23:14:14 发布