"""Scrape second-hand housing listings from sh.lianjia.com into an Excel file.

Running the module as a script fetches the listing index pages concurrently,
appends one row per listing to the module-level worksheet ``sheet`` and saves
the workbook as ``house2.xlsx``.
"""
import asyncio
import datetime
import logging

import aiohttp
import openpyxl
from lxml import etree

# Module-level workbook shared by every Spider; the first row is the header.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['房源', '房子信息', '所在区域', '单价', '关注人数和发布时间', '标签'])
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')
# Wall-clock start, used by the script entry point to report elapsed time.
start = datetime.datetime.now()


class Spider(object):
    """Concurrent scraper for the lianjia.com Shanghai listing index pages."""

    # Maximum number of page downloads allowed to run at the same time.
    max_concurrency = 6

    def __init__(self):
        # Semaphore caps the number of concurrent requests so the crawl is
        # not fast enough to trigger the site's anti-scraping measures.
        self.semaphore = asyncio.Semaphore(self.max_concurrency)
        self.header = {
            "Host": "sh.lianjia.com",
            "Referer": "https://sh.lianjia.com/ershoufang/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }

    async def scrape(self, url):
        """Download *url* and return the response body as text.

        The request waits for a semaphore slot and then sleeps three seconds
        before being sent. The session and the response are managed with
        ``async with`` so they are released even when the request or the body
        read raises.
        """
        async with self.semaphore:
            await asyncio.sleep(3)  # throttle: pause before every request
            async with aiohttp.ClientSession(headers=self.header) as session:
                async with session.get(url) as response:
                    return await response.text()

    async def scrape_index(self, page):
        """Fetch listing index page number *page* and record its rows."""
        url = f'https://sh.lianjia.com/ershoufang/pg{page}/'
        text = await self.scrape(url)
        await self.parse(text)

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath match, or *default* when nothing matched."""
        return nodes[0] if nodes else default

    async def parse(self, text):
        """Extract every listing from the page HTML *text*.

        Each listing becomes one row appended to the module-level ``sheet``
        and is also logged at INFO level. List items without a title are
        skipped and other missing fields become empty strings, so a single
        malformed item no longer aborts the crawl with ``IndexError``.
        """
        if not text:
            logging.warning('empty response body, nothing to parse')
            return
        html = etree.HTML(text)
        if html is None:  # the parser produced no tree for this input
            logging.warning('response could not be parsed as HTML')
            return

        for li in html.xpath('//*[@id="content"]/div[1]/ul/li'):
            # Listing title
            house_data = self._first(li.xpath('.//div[@class="title"]/a/text()'))
            if not house_data:
                # NOTE(review): presumably an ad or placeholder item rather
                # than a listing -- confirm against the live page markup.
                continue
            # House details
            house_info = self._first(li.xpath('.//div[@class="houseInfo"]/text()'))
            # Location
            address = ' '.join(li.xpath('.//div[@class="positionInfo"]/a/text()'))
            # Unit price (yuan per square metre)
            price = self._first(
                li.xpath('.//div[@class="priceInfo"]/div[2]/span/text()'))
            # Follower count and publish time
            attention_num = self._first(
                li.xpath('.//div[@class="followInfo"]/text()'))
            # Tags
            tag = ' '.join(li.xpath('.//div[@class="tag"]/span/text()'))

            row = [house_data, house_info, address, price, attention_num, tag]
            sheet.append(row)
            logging.info(row)

    async def _crawl(self, pages):
        """Scrape all *pages* concurrently inside the running event loop."""
        # Rebind the semaphore here: on Python < 3.10 asyncio primitives are
        # tied to the loop that was current at construction time, which is
        # not the loop that asyncio.run() creates.
        self.semaphore = asyncio.Semaphore(self.max_concurrency)
        await asyncio.gather(*(self.scrape_index(page) for page in pages))

    def main(self, pages=range(1, 2)):
        """Run the crawl to completion.

        Args:
            pages: iterable of index page numbers to fetch. The default keeps
                the original behaviour of fetching page 1 only; pass
                ``range(1, 101)`` to fetch 100 pages.
        """
        asyncio.run(self._crawl(pages))


if __name__ == '__main__':
    spider = Spider()
    spider.main()
    wb.save('house2.xlsx')
    delta = (datetime.datetime.now() - start).total_seconds()
    print("用时:{:.3f}s".format(delta))
python spider lianjia
最新推荐文章于 2025-04-29 09:22:12 发布