高考录取分数线爬虫

Python爬虫：快速抓取全国高校信息

最新推荐文章于 2024-07-10 23:25:22 发布

原创 · 最新推荐文章于 2024-07-10 23:25:22 发布 · 1.1k 阅读

8 ·

CC 4.0 BY-SA版权

文章标签：

#python #json #restful

高考录取分数线专栏收录该内容

4 篇文章

订阅专栏

本文介绍了一个使用Python aiohttp库实现的并发爬虫，用于抓取全国5000所高校的基本信息，如学校ID、名称、层次、排名等，并保存到CSV文件中。通过设置并发限制和使用异步IO，提高了爬取效率。最后，对数据进行去重和排序，以便进一步分析。

# -*- coding: utf-8 -*-
'''
作者 : dy
开发时间 : 2021/6/15 17:15
'''
import asyncio
import logging
import time
from pathlib import Path

import aiohttp
import pandas as pd
from tqdm import tqdm

# Working directory at import time; college_info.csv is read from and written to this directory.
current_path = Path.cwd()

def get_url_list(max_id):
    """Build the info.json URLs for every school id that is not saved yet.

    Candidate ids are 0 .. max_id - 1. When college_info.csv already exists
    under ``current_path``, every id found in its '学校id' column is left out,
    so a re-run only requests the ids that are still missing from the file.

    Args:
        max_id: Exclusive upper bound of the school ids to consider.

    Returns:
        A list of URL strings, one per remaining id.
    """
    template = 'https://static-data.eol.cn/www/2.0/school/%d/info.json'
    csv_file = Path(current_path, 'college_info.csv')
    pending = set(range(max_id))
    if csv_file.exists():
        saved = pd.read_csv(csv_file)
        # Drop the ids that an earlier run has already written to the CSV.
        pending.difference_update(set(saved['学校id'].unique()))
    return [template % school_id for school_id in pending]


async def get_json_data(url, semaphore):
    """Download one school's info.json and append its record to the CSV.

    The request is made while holding ``semaphore``, which is how the caller
    limits the number of simultaneous downloads. This is a best-effort fetch:
    a failed request, an unparsable or empty body, or a payload missing the
    expected fields makes the function skip the URL instead of raising.

    Args:
        url: Address of the info.json document to fetch.
        semaphore: asyncio.Semaphore shared by all download tasks.

    Returns:
        None in every case; the useful effect is the row that save_to_csv
        appends to college_info.csv.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    }
    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=6)
    async with semaphore:
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False), trust_env=True) as session:
            try:
                async with session.get(url=url, headers=headers, timeout=timeout) as response:
                    # The decoding charset is selected through json()'s
                    # `encoding` argument rather than by setting an attribute
                    # on the response. Awaiting here suspends this task so the
                    # event loop can run the other downloads in the meantime.
                    json_data = await response.json(encoding='utf-8')
                if not json_data:
                    # Empty payload: nothing to save for this id.
                    return None
                return save_to_csv(json_data['data'])
            except (aiohttp.ClientError, asyncio.TimeoutError, OSError,
                    ValueError, KeyError, TypeError) as exc:
                # Deliberately narrower than a bare `except:` so that task
                # cancellation and KeyboardInterrupt still propagate. Logged at
                # debug level to keep the progress bar readable by default.
                logging.getLogger(__name__).debug('skipped %s: %r', url, exc)
                return None


def save_to_csv(json_info):
    """Append one school's record to college_info.csv.

    Picks the fields of interest out of ``json_info``, builds a one-row
    DataFrame and appends it to college_info.csv under ``current_path``. The
    column header is written when the file does not exist yet or is still
    empty, so the file always starts with a header row.

    Args:
        json_info: Mapping with the school's fields. The keys read are
            school_id, name, f985, f211, level_name, rank (itself a mapping
            with ruanke_rank, xyh_rank, wsl_rank, qs_world, us_rank),
            type_name, province_name, city_name, town_name, phone and site.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    # The flags are compared against the string '1', exactly as before.
    if json_info['f985'] == '1' and json_info['f211'] == '1':
        level = "985 211"
    elif json_info['f211'] == '1':
        level = "211"
    else:
        level = json_info['level_name']

    rank = json_info['rank']
    save_info = {
        '学校id': json_info['school_id'],         # school id
        '学校名称': json_info['name'],             # school name
        '学校层次': level,                         # school tier
        '软科排名': rank['ruanke_rank'],           # Ruanke ranking
        '校友会排名': rank['xyh_rank'],            # alumni association ranking
        '武书连排名': rank['wsl_rank'],            # Wu Shulian ranking
        'QS世界排名': rank['qs_world'],            # QS world ranking
        'US世界排名': rank['us_rank'],             # US world ranking
        '学校类型': json_info['type_name'],        # school type
        '省份': json_info['province_name'],        # province
        '城市': json_info['city_name'],            # city
        '所处地区': json_info['town_name'],        # district
        '招生办电话': json_info['phone'],          # admissions office phone
        '招生办官网': json_info['site'],           # admissions office website
    }

    df = pd.DataFrame(save_info, index=[0])

    csv_path = Path(current_path, 'college_info.csv')
    # An existing but zero-byte file still needs the header; otherwise the
    # first data row would later be parsed as the column names.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0
    df.to_csv(csv_path, index=False, mode='a', header=write_header)


async def main(loop):
    """Crawl every pending school URL concurrently behind a progress bar.

    Args:
        loop: Event loop on which the download tasks are created.
    """
    # At most 500 downloads may hold the semaphore at the same time.
    limiter = asyncio.Semaphore(500)
    # URLs for the ids below 5000 that get_url_list still reports as missing.
    pending_urls = get_url_list(5000)
    # One task per URL, all sharing the same limiter.
    jobs = []
    for target in pending_urls:
        jobs.append(loop.create_task(get_json_data(target, limiter)))
    # Await the tasks in completion order so the bar advances as they finish.
    finished = asyncio.as_completed(jobs)
    for job in tqdm(finished, total=len(jobs)):
        await job


if __name__ == '__main__':
    # Script entry point: run the crawl, then tidy the CSV it produced.
    start = time.time()
    # The selector policy class is only defined on Windows builds of asyncio;
    # on other platforms the default policy is left untouched.
    if hasattr(asyncio, 'WindowsSelectorEventLoopPolicy'):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    # Create the loop explicitly: calling get_event_loop() with no running
    # loop is deprecated on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        # Run the crawl until every task has finished.
        loop.run_until_complete(main(loop))
    finally:
        # Close the loop even when the crawl is interrupted.
        loop.close()
    csv_path = Path(current_path, 'college_info.csv')
    # Nothing to tidy when no record was ever written.
    if csv_path.exists():
        df = pd.read_csv(csv_path)
        df.drop_duplicates(keep='first', inplace=True)
        df.sort_values('学校id', inplace=True)
        df.reset_index(drop=True, inplace=True)
        # A Ruanke rank of 0 is rewritten to 999 (presumably 0 marks an
        # unranked school, which this moves to the end of a rank sort -- TODO confirm).
        df.loc[df['软科排名'] == 0, '软科排名'] = 999
        df.to_csv(csv_path, index=False)
    print(f'采集完成，共耗时：{round(time.time() - start, 2) } 秒')

转自：1分钟爬取全国高校信息，制成大屏可视化！_俊红的数据分析之路的博客-CSDN博客