# Python spider for Lianjia (sh.lianjia.com) second-hand housing listings.

import asyncio
import aiohttp
from lxml import etree
import logging
import datetime
import openpyxl

# Module-level workbook/worksheet: Spider.parse appends one row per listing,
# and the script saves the file once at exit.
wb = openpyxl.Workbook()
sheet = wb.active
# Header row (Chinese column titles: listing title, house info, district,
# unit price, followers/posting time, tags). Must stay in sync with the
# row layout appended in Spider.parse.
sheet.append(['房源', '房子信息', '所在区域', '单价', '关注人数和发布时间', '标签'])
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
start = datetime.datetime.now()  # wall-clock start, used to report elapsed time at exit


class Spider(object):
    """Async crawler for Lianjia (sh.lianjia.com) second-hand housing listings.

    Each parsed listing is appended to the module-level ``sheet`` and logged.
    """

    def __init__(self):
        # Semaphore caps concurrent requests so we don't trip anti-crawling.
        self.semaphore = asyncio.Semaphore(6)
        self.header = {
            "Host": "sh.lianjia.com",
            "Referer": "https://sh.lianjia.com/ershoufang/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }

    async def scrape(self, url):
        """Fetch *url* and return the response body as text.

        Uses ``async with`` so the session and response are always closed —
        the original created a session per call and leaked it if the request
        or body read raised.
        """
        async with self.semaphore:
            await asyncio.sleep(3)  # throttle between requests (politeness delay)
            async with aiohttp.ClientSession(headers=self.header) as session:
                async with session.get(url) as response:
                    return await response.text()

    async def scrape_index(self, page):
        """Download and parse one listing index page."""
        url = f'https://sh.lianjia.com/ershoufang/pg{page}/'
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Extract listing fields from an index page; append each row to ``sheet``."""
        html = etree.HTML(text)
        lis = html.xpath('//*[@id="content"]/div[1]/ul/li')
        for li in lis:
            title = li.xpath('.//div[@class="title"]/a/text()')
            if not title:
                # Ad/promo <li> cards have no title node; the original's
                # unconditional [0] raised IndexError and killed the task.
                continue
            house_data = title[0]  # listing title
            info = li.xpath('.//div[@class="houseInfo"]/text()')
            house_info = info[0] if info else ''  # layout/size summary
            address = ' '.join(li.xpath('.//div[@class="positionInfo"]/a/text()'))  # location
            price_nodes = li.xpath('.//div[@class="priceInfo"]/div[2]/span/text()')
            price = price_nodes[0] if price_nodes else ''  # unit price (yuan per sqm)
            follow = li.xpath('.//div[@class="followInfo"]/text()')
            attention_num = follow[0] if follow else ''  # followers and posting time
            tag = ' '.join(li.xpath('.//div[@class="tag"]/span/text()'))  # tags
            sheet.append([house_data, house_info, address, price, attention_num, tag])
            logging.info([house_data, house_info, address, price, attention_num, tag])

    def main(self, last_page=1):
        """Scrape index pages 1..last_page concurrently.

        ``last_page`` defaults to 1, matching the original ``range(1, 2)``;
        pass 100 to crawl the 100 pages the old comment claimed.
        """
        async def _run():
            tasks = [asyncio.ensure_future(self.scrape_index(page))
                     for page in range(1, last_page + 1)]
            await asyncio.gather(*tasks)

        # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
        # pattern and guarantees loop cleanup.
        asyncio.run(_run())


if __name__ == '__main__':
    # Run the crawl, persist the workbook, then report wall-clock time.
    crawler = Spider()
    crawler.main()
    wb.save('house2.xlsx')
    elapsed_seconds = (datetime.datetime.now() - start).total_seconds()
    print("用时:{:.3f}s".format(elapsed_seconds))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值