Prerequisites:
This post uses the threading module to download with multiple threads, which greatly shortens the overall download time.
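Before the full script, here is a minimal sketch of the pattern it relies on: one thread per download, started together and joined at the end, so the waits on network I/O overlap instead of running one after another. The URLs and filenames below are placeholders, not taken from the target site.

# Minimal sketch: each image URL is downloaded in its own thread.
import threading
import requests

def download(url, filename):
    # Fetch one URL and write the body to a local file; errors are just printed.
    try:
        resp = requests.get(url, timeout=10)
        with open(filename, "wb") as f:
            f.write(resp.content)
        print("saved", filename)
    except requests.RequestException as exc:
        print("failed", url, exc)

if __name__ == "__main__":
    urls = [
        ("https://example.com/a.jpg", "a.jpg"),  # placeholder URLs
        ("https://example.com/b.jpg", "b.jpg"),
    ]
    threads = [threading.Thread(target=download, args=(u, name)) for u, name in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for all downloads to finish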
Script
# coding:utf-8
import os
import time
import threading
import requests
from lxml import etree
# Create an "image" directory next to this script to hold the downloaded files
cur_path = os.path.dirname(__file__)
file_path = os.path.join(cur_path, "image")
if not os.path.exists(file_path):
    os.mkdir(file_path)
class LoadImage():
    def __init__(self, page_num=1, base_url="http://sj.zol.com.cn/bizhi/new_%d.html"):
        """
        :param page_num: which page number to crawl
        :param base_url: URL template of the page to crawl
        """
        # This site does not seem to impose any restrictions, so requests work even
        # without headers; they are added here anyway in case crawling gets limited.
        self.s = requests.session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Upgrade-Insecure-Requests": "1"
        }
        self.base_url = base_url
        self.page_num = page_num

    def load_page(self):
        url = (self.base_url