Basic version:
from time import sleep
from lxml import etree
import requests
import json
import random
import time
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
count = 0
dic = {"foo": []}
# Configure your own proxy addresses here if needed
# poxys = [
#     {"HTTP": "117.88.5.253:3000"},
# ]
start = time.time()
# The number of pages to crawl can be configured here
for i in range(1, 11):
    # With a proxy
    # a = random.choice(poxys)
    # page_text = requests.get(url="https://www.autohome.com.cn/all/%s/" % i, headers=headers, proxies=a)
    # Without a proxy
    page_text = requests.get(url="https://www.autohome.com.cn/all/%s/" % i, headers=headers)
    page_text.encoding = 'gbk'
    tree = etree.HTML(page_text.text)
    ul_list = tree.xpath('//*[@id="auto-channel-lazyload-article"]/ul')
    for ul in ul_list:
        count += 1
        print(count)
        for li in ul:
            dic1 = {}
            title_list = li.xpath('./a/h3/text()')
            if title_list:
                title = title_list[0]
                print(title)
                url = li.xpath('./a/@href')[0]
                content = li.xpath('./a/p/text()')[0]
                img_url = li.xpath('./a/div[1]/img/@src')[0]
                sort = li.xpath('./a/@href')[0].split("/")[3]
                dic1["title"] = title
                dic1["url"] = url
                dic1["content"] = content
                dic1["img_url"] = img_url
                dic1["sort"] = sort
                dic["foo"].append(dic1)
    # Save the collected data
    # if count == 20 or i == 10:
    #     fp = open(f'{i}.txt', 'w')
    #     print('Writing file...')
    #     fp.write(json.dumps(dic, indent=4))
    #     print('Write finished')
    #     fp.close()
    #
    #     count = 0
    #     dic = {"foo": []}
end = time.time()
print(end-start)
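The file-saving block above is left commented out. As a minimal sketch of one way to persist what the loop collects (the file name 'autohome_news.json' is a placeholder I chose, not from the original), the accumulated dic can be dumped once after the page loop finishes:

# Sketch: dump everything accumulated in `dic` after the page loop finishes.
# 'autohome_news.json' is a placeholder file name.
with open('autohome_news.json', 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(dic, ensure_ascii=False, indent=4))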
Coroutine version:
# date : 2020/5/19 15:00
from gevent import monkey
# Import the monkey module from the gevent library.
monkey.patch_all()
# monkey.patch_all() patches the standard library so that blocking I/O calls
# yield to other greenlets, which is what makes the program run asynchronously.
import gevent, time, requests
# Import gevent, time and requests
from gevent.queue import Queue
# Import the Queue class from gevent's queue module
from lxml import etree
import random
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}
# If a proxy is needed, it can be configured here
# poxys = [
#     {"HTTP": "117.88.5.253:3000"},
# ]
start = time.time()
# Record the start time
# The URLs to crawl
urls = []
for i in range(2600, 5200):
    urls.append('https://www.autohome.com.cn/all/%s/' % i)
work = Queue()
# Create a queue object and assign it to work.
for url in urls:
    work.put_nowait(url)
    # put_nowait() puts each URL into the queue
def crawler():
    dic = {"foo": []}
    while not work.empty():
        # Keep working as long as the queue is not empty.
        url = work.get_nowait()
        # get_nowait() takes the next URL out of the queue.
        # Fetch the page through a proxy
        # a = random.choice(poxys)
        # page_text = requests.get(url, headers=headers, proxies=a)
        # Fetch the page without a proxy
        page_text = requests.get(url, headers=headers)
        print(url, work.qsize(), page_text.status_code)
        # Print the URL, the remaining queue size and the response status code
        page_text.encoding = 'gbk'
        tree = etree.HTML(page_text.text)
        ul_list = tree.xpath('//*[@id="auto-channel-lazyload-article"]/ul')
        for ul in ul_list:
            for li in ul:
                dic1 = {}
                title_list = li.xpath('./a/h3/text()')
                if title_list:
                    title = title_list[0]
                    url = li.xpath('./a/@href')[0]
                    content = li.xpath('./a/p/text()')[0]
                    img_url = li.xpath('./a/div[1]/img/@src')[0]
                    sort = li.xpath('./a/@href')[0].split("/")[3]
                    dic1["title"] = title
                    dic1["url"] = url
                    dic1["content"] = content
                    dic1["img_url"] = img_url
                    dic1["sort"] = sort
                    dic["foo"].append(dic1)
                    # print(title)
    # Note: every greenlet writes the same file in 'w' mode, so the greenlet
    # that finishes last overwrites the others' results (see the sketch below).
    fp = open('qczj(2600-5199).txt', 'w')
    fp.write(json.dumps(dic, indent=4))
    fp.close()
task_list = []
# Create an empty task list
for i in range(5):
    # This loop spawns 5 crawler greenlets
    task = gevent.spawn(crawler)
    # gevent.spawn() creates a task that runs the crawler() function.
    task_list.append(task)
    # Add the task to the task list
gevent.joinall(task_list)
# gevent.joinall() runs all the tasks in the list, i.e. starts the crawlers.
end = time.time()
# Record the end time
print(end-start)
# Total elapsed time
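One caveat noted in the comments above: all five greenlets open 'qczj(2600-5199).txt' in 'w' mode, so whichever finishes last overwrites the others' results. A minimal sketch of one way around this (the shared results dict and the trimmed field set are my additions, not part of the original script; it reuses the work queue, headers and imports set up above) is to have each crawler only collect items and write a single combined file after gevent.joinall() returns:

# Sketch: collect everything in one shared structure and write once at the end.
# Greenlets run cooperatively in a single thread and list.append does not
# yield, so appending to a shared list from several crawlers is safe here.
results = {"foo": []}  # shared by all greenlets (hypothetical name)

def crawler():
    while not work.empty():
        url = work.get_nowait()
        page_text = requests.get(url, headers=headers)
        page_text.encoding = 'gbk'
        tree = etree.HTML(page_text.text)
        for li in tree.xpath('//*[@id="auto-channel-lazyload-article"]/ul/li'):
            title_list = li.xpath('./a/h3/text()')
            if title_list:
                results["foo"].append({
                    "title": title_list[0],
                    "url": li.xpath('./a/@href')[0],
                })

task_list = [gevent.spawn(crawler) for _ in range(5)]
gevent.joinall(task_list)

# Write the combined results once, after every greenlet has finished.
with open('qczj(2600-5199).txt', 'w') as fp:
    fp.write(json.dumps(results, indent=4))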