1. Preface:
It was World Programmer's Day (1024, October 24), no overtime and nothing much to do... so I wrote a Kuaidaili proxy spider and a blog page-view booster. Every step is clearly commented; the code is for learning and reference only!
---- Nick.Peng
2. Required environment:
- Python 3.x
- Modules: requests, json, lxml, fake_useragent
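- The third-party modules can be installed with pip if missing, e.g. `pip install requests lxml fake-useragent`; json ships with the Python standard library.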
3. Kuaidaili proxy spider code:
- Filename: proxyip_spider.py
- Running it directly saves a file named proxyip.json in the current directory; this file holds the proxy IPs we scraped.
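- Each entry in that file is a small dict such as `{"ip_type": "HTTP", "ip": "1.2.3.4:8080"}` (the address shown here is only a placeholder); this is exactly the format the auto-click script in section 4 reads back.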
```python
# -*- coding: utf-8 -*-
# @Author: Nick
# @Date: 2019-10-20 15:40:58
# @Last Modified by: Nick
# @Last Modified time: 2019-10-24 16:54:31
import requests
import json
import time
from lxml import etree


class Proxyip_spider(object):
    """Scrape free proxies from kuaidaili.com and save them to proxyip.json."""

    def __init__(self):
        # self.base_url = "https://www.kuaidaili.com/free/"
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'}
        self.url = "https://www.kuaidaili.com/free/inha/{}"

    def send_request(self, page):
        # Fetch one listing page and return the decoded HTML
        data = requests.get(self.url.format(page),
                            headers=self.headers).content.decode()
        return data

    def parse_data(self, page):
        html_str = self.send_request(page)
        # print(html_str)
        element_obj = etree.HTML(html_str)
        # Pull protocol type, IP and port out of the free-proxy table
        ip_type = element_obj.xpath(
            '//*[@id="list"]/table/tbody/tr/td[4]/text()')
        ip = element_obj.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
        port = element_obj.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
        data_li = list(zip(ip_type, ip, port))
        # print(data_li)
        return data_li

    def save_data(self, data):
        # Overwrite the file each run so it always contains one valid JSON array
        with open("proxyip.json", "w") as f:
            f.write(json.dumps(data))

    def main(self):
        proxy_data = list()
        for p in range(10):
            print("Scraping page {}...".format(p + 1))
            data_li = self.parse_data(p + 1)
            for per_data in data_li:
                ip_type, ip, port = per_data
                proxy_data.append(dict(ip_type=ip_type, ip=ip + ":" + port))
            # print(proxy_data)
            time.sleep(1)
        self.save_data(proxy_data)
        print(proxy_data)


if __name__ == '__main__':
    Proxyip_spider().main()
```
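As a quick sanity check (not part of the spider itself), the saved proxyip.json can be loaded back and a single entry converted into the `proxies` dict format that requests expects. The sketch below assumes proxyip.json already exists in the current directory and that the test URL http://www.baidu.com is reachable:

```python
import json

import requests

# Load the proxies saved by proxyip_spider.py
with open("proxyip.json") as f:
    proxy_list = json.loads(f.read())

print("Scraped {} proxies, first entry: {}".format(len(proxy_list), proxy_list[0]))

# Convert one entry into the dict format requests expects,
# e.g. {"http": "1.2.3.4:8080"} -- scheme keys must be lowercase
first = proxy_list[0]
proxy_dict = {first["ip_type"].lower(): first["ip"]}

# Optionally verify that this proxy actually works
try:
    resp = requests.get("http://www.baidu.com", proxies=proxy_dict, timeout=3)
    print("Proxy check status code:", resp.status_code)
except Exception as e:
    print("Proxy check failed:", e)
```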
4. Blog view-count booster code:
- Filename: auto_click.py
```python
# -*- coding: utf-8 -*-
# @Author: Nick
# @Date: 2019-10-12 15:42:22
# @Last Modified by: Nick
# @Last Modified time: 2019-10-12 17:30:44
import requests
import json
import time
import random
import re
from fake_useragent import UserAgent

# Instantiate a UserAgent object, used to generate a random User-Agent for each request
ua = UserAgent()

verify_url = "http://www.baidu.com"

# List of blog post URLs to visit
url_list = [
    'https://blog.youkuaiyun.com/PY0312/article/details/102512274',
    'https://blog.youkuaiyun.com/PY0312/article/details/102501233',
    'https://blog.youkuaiyun.com/PY0312/article/details/102463238',
    'https://blog.youkuaiyun.com/PY0312/article/details/101087356',
    'https://blog.youkuaiyun.com/PY0312/article/details/96507072'
]


# Read the proxy IP list (from the proxyip.json produced in section 3)
# and use each proxy in turn to visit the blog URLs above
def auto_click():
    proxy_list = []
    with open("proxyip.json") as f:
        proxy_list = json.loads(f.read())
    # print(proxy_list)
    # print(type(proxy_list))

    while True:
        for proxy in proxy_list:
            header = {'User-Agent': ua.random}
            # Build the proxies dict requests expects; scheme keys must be lowercase
            temp_proxy = {}
            temp_proxy[proxy["ip_type"].lower()] = proxy["ip"]
            # print(type(temp_proxy))
            try:
                # Check the proxy against a known-good URL before using it
                response = requests.get(
                    verify_url, headers=header, proxies=temp_proxy, timeout=3)
                if response.status_code == 200:
                    temp_url = random.choice(url_list)
                    num = re.findall(r"/(\d+)", temp_url)
                    print("Clicking article: %s" % num)
                    requests.get(temp_url, headers=header, proxies=temp_proxy)
                else:
                    print("IP: {} is not usable, trying the next one...".format(
                        proxy.get("ip")))
                    time.sleep(0.1)
                    continue
            except Exception as e:
                print('=========wrong: {}========='.format(e))
                time.sleep(0.1)
            time.sleep(1)


if __name__ == "__main__":
    auto_click()
```
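Note: auto_click.py reads proxyip.json from the current directory, so run `python proxyip_spider.py` before `python auto_click.py`. Free proxy IPs tend to go stale quickly, so re-scraping the list shortly before each run usually gives better results.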