# Basic idea:
# Create two queues: a page-number queue and a parse queue.
# Two kinds of threads:
#   Spider threads: take a page number from the page queue, build the URL, fetch the page, and put the response into the parse queue.
#   Parse threads: take responses from the parse queue, extract the needed fields, and write them to a file.
# Use the queues' join()/task_done() mechanism to make sure every queued item gets processed (see the minimal sketch right below).
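# A minimal, standalone sketch of the join()/task_done() coordination the crawler below
# relies on. _queue_pattern_demo is an illustrative helper only; nothing else in this
# script calls it. Each worker calls task_done() after finishing an item, and join()
# blocks until every put() item has been marked done.
def _queue_pattern_demo():
    import queue
    from threading import Thread

    q = queue.Queue()
    for n in range(3):
        q.put(n)

    def worker():
        while True:
            try:
                item = q.get(block=False)
            except queue.Empty:
                break
            print("handled", item)
            q.task_done()  # mark this item as finished

    Thread(target=worker).start()
    q.join()  # returns only once every item has been marked done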
import re
import json
import queue
import random
import requests
from time import sleep
from threading import Lock
from threading import Thread
from lxml import etree
from fake_useragent import UserAgent
base_url = "https://sz.fang.lianjia.com/loupan/pg%d/"
ua = UserAgent()
# Global exit flag telling the parse threads to stop
parse_exit_flag = False
lock = Lock()
class SpiderThread(Thread):
    def __init__(self, id, page_queue, parse_queue, *args, **kwargs):
        self.id = id
        self.page_queue = page_queue
        self.parse_queue = parse_queue
        super().__init__(*args, **kwargs)

    def run(self):
        print(f"Spider thread {self.id} started")
        while True:
            # Keep pulling page numbers from page_queue; exit when it is empty.
            # get(block=False) avoids the race where another thread drains the
            # queue between an empty() check and a blocking get().
            try:
                page = self.page_queue.get(block=False)
            except queue.Empty:
                break
            # Build the URL for this page
            url = base_url % page
            headers = {
                'User-Agent': ua.random
            }
            # Retry up to 4 times
            times = 4
            while times > 0:
                try:
                    response = requests.get(url, headers=headers)
                    print(f'Spider thread {self.id} fetched {url}')
                    self.parse_queue.put(response.text)
                    sleep(random.choice([1, 2]))
                    break
                except requests.RequestException as e:
                    print(e)
                finally:
                    times -= 1
            # Mark the page as done whether or not the fetch succeeded;
            # the placement matters, otherwise page_queue.join() never returns.
            self.page_queue.task_done()
# Parse threads
class ParseThread(Thread):
    def __init__(self, id, parse_queue, fp, *args, **kwargs):
        self.id = id
        self.fp = fp
        self.parse_queue = parse_queue
        super().__init__(*args, **kwargs)

    def run(self):
        print(f"Parse thread {self.id} started")
        global parse_exit_flag
        while True:
            if parse_exit_flag:
                break
            try:
                # Take one response from parse_queue
                resp = self.parse_queue.get(block=False)
                # parse() extracts the fields and writes them to the file
                self.parse(resp)
                print(f'Parse thread {self.id} parsed one page')
                self.parse_queue.task_done()
            except queue.Empty:
                # Nothing to parse yet; back off briefly instead of busy-waiting
                sleep(0.1)

    def parse(self, resp):
        html = etree.HTML(resp)
        li_list = html.xpath("//li[@class='resblock-list post_ulog_exposure_scroll has-results']")
        items = []
        for li in li_list:
            img = li.xpath('.//img/@src')[0]
            title = li.xpath('.//div[@class="resblock-name"]/a/text()')[0]
            price = li.xpath('string(.//div[@class="main-price"])').replace('\n', '').strip()
            price = re.sub(r'\s+', '', price)
            item = {
                'img': img,
                'title': title,
                'price': price
            }
            items.append(item)
        # The file object is shared by every parse thread, so serialize writes with the lock
        with lock:
            self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')
def main():
    # Create the two queues
    page_queue = queue.Queue(10)
    parse_queue = queue.Queue(10)
    # Put ten page numbers into page_queue
    for i in range(1, 11):
        page_queue.put(i)
    # Start the spider threads
    for i in range(4):
        SpiderThread(id=i, page_queue=page_queue, parse_queue=parse_queue).start()
    fp = open("./lianjie.txt", 'a', encoding="utf-8")
    # Start the parse threads
    for i in range(4):
        ParseThread(id=i, parse_queue=parse_queue, fp=fp).start()
    # Wait until every page has been fetched, then until every response has been parsed.
    # Joining page_queue first matters: it guarantees no new items can arrive in
    # parse_queue after parse_queue.join() returns.
    page_queue.join()
    parse_queue.join()
    # Flip the global flag so the parse threads exit their loops
    global parse_exit_flag
    parse_exit_flag = True
    fp.close()


if __name__ == '__main__':
    main()
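Each write in parse() appends one line to lianjie.txt, where the line is a JSON array of {img, title, price} dicts. Below is a minimal sketch for loading those results back into Python; the load_results helper and its default path are illustrative assumptions, not part of the crawler above.

import json

def load_results(path="./lianjie.txt"):
    # Collect every item from every JSON-array line written by the parse threads
    results = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                results.extend(json.loads(line))
    return results

# Example usage:
# for item in load_results():
#     print(item['title'], item['price'])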