Crawler basics: scraping unprotected pages and making the crawl grow endlessly

Basic version

Fetch the page and print the raw front-end HTML without any further processing.

# pip3 install requests
import requests

# request the target URL
def crawler():
    response = requests.get("https://www.scrapingcourse.com/ecommerce/")
    response.raise_for_status()
    print(response.text)

# execute the crawler
crawler()
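
Printing to the terminal is fine for a quick look, but a large page is hard to read that way. The small variation below saves the response body to a local file instead; it is only a sketch, and the filename is an arbitrary example rather than something from the original post.

# pip3 install requests
import requests

# fetch the page and save the raw HTML to disk for offline inspection
def crawl_to_file(url, filename="page.html"):  # filename is a hypothetical example
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    with open(filename, "w", encoding=response.encoding or "utf-8") as f:
        f.write(response.text)

crawl_to_file("https://www.scrapingcourse.com/ecommerce/")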

An endlessly growing crawler

Start from the first link and keep a record of every link that has already been visited;
from the HTML fetched at each page, collect the a[href] links and add them to the list of links still to visit;
links that have already been crawled are simply skipped with continue (a deque-based sketch of this frontier appears after the full script below).

# pip3 install requests
import requests

def crawler():
    while urls_to_visit:

        # get the page to visit from the list
        current_url = urls_to_visit.pop(0)
        print(current_url)
        if current_url in visited_urls:
            continue
        # record the visited URL so it is not crawled again
        visited_urls.add(current_url)

        try:
            response = requests.get(current_url, timeout=5)  # timeout so a dead link cannot hang the crawl
            response.raise_for_status()  # raise if the request failed
        except requests.RequestException as e:
            print(f"Request failed: {current_url}, error: {e}")
            continue

        # parse the HTML
        soup = BeautifulSoup(response.text, "html.parser")

        # collect all the links
        link_elements = soup.select("a[href]")
        for link_element in link_elements:
            url = link_element["href"]

            if url.startswith("#"):
                continue  # ignore in-page anchor links

            # convert links to absolute URLs
            if not url.startswith("http"):
                absolute_url = requests.compat.urljoin(target_url, url)
            else:
                absolute_url = url

            # only follow links on the target domain that have not been queued or visited yet
            if (
                absolute_url.startswith(target_url)
                and absolute_url not in urls_to_visit
                and absolute_url not in visited_urls
            ):
                urls_to_visit.append(absolute_url)  # queue the absolute URL, not the raw href

# pip3 install requests beautifulsoup4

from bs4 import BeautifulSoup



target_url = "https://www.scrapingcourse.com/ecommerce/"
# initialize the list of discovered URLs
urls_to_visit = [target_url]
visited_urls = set()  # URLs already visited, to avoid re-crawling
# execute the crawler
crawler()
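
A small design note on the frontier: urls_to_visit.pop(0) on a Python list is O(n), because every remaining element shifts left. For larger crawls, collections.deque gives O(1) pops from the front. Below is a minimal sketch of the same breadth-first frontier under that change; the fetching and parsing are assumed to stay as in the script above.

from collections import deque

# same BFS frontier as above, but with a deque so popping the next URL is O(1)
target_url = "https://www.scrapingcourse.com/ecommerce/"
urls_to_visit = deque([target_url])
visited_urls = set()

while urls_to_visit:
    current_url = urls_to_visit.popleft()   # O(1), unlike list.pop(0)
    if current_url in visited_urls:
        continue
    visited_urls.add(current_url)
    # ... fetch current_url, parse its links, then for each new absolute_url:
    # urls_to_visit.append(absolute_url)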

What the endless growth looks like

Some links fail to load; the crawler prints an error message for them and moves on.
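
If most of those failures are transient (timeouts, 429 or 5xx responses), one option is to let requests retry automatically before giving up. This is a minimal sketch, not part of the original script; the retry counts and status codes below are illustrative choices.

# a sketch: a requests Session that retries transient failures with backoff
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))
session.mount("http://", HTTPAdapter(max_retries=retries))

# use session.get(...) in place of requests.get(...) inside the crawl loop
response = session.get("https://www.scrapingcourse.com/ecommerce/", timeout=5)
print(response.status_code)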

Variant: crawling GitHub android cve repository links

import requests
from bs4 import BeautifulSoup
import csv
import json
import time

# GitHub search page for android cve repositories
base_url = "https://github.com/search?q=android%20cve&type=repositories"

# URLs queued for visiting
urls_to_visit = [base_url]

# URLs already visited, to avoid re-crawling
visited_urls = set()

# scraped GitHub repository records
repo_data = []

# request headers (set a User-Agent so requests look like a normal browser)
headers = {"User-Agent": "Mozilla/5.0"}


def crawler():
    while urls_to_visit:
        current_url = urls_to_visit.pop(0)

        # skip URLs we have already visited
        if current_url in visited_urls:
            continue

        print(f"Visiting: {current_url}")
        visited_urls.add(current_url)

        try:
            # send the request
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()

            # parse the HTML
            soup = BeautifulSoup(response.content, "html.parser")

            # extract the repository entries
            repo_containers = soup.find_all("div", class_="search-title")

            for repo in repo_containers:
                # repository name and link
                link = repo.find("a")
                if not link:
                    continue

                repo_name = link.get_text(strip=True)
                repo_url = "https://github.com" + link["href"]

                # description
                description_tag = repo.find_next("p")
                repo_description = description_tag.get_text(strip=True) if description_tag else "No description"

                # star count
                star_tag = repo.find_next("a", class_="Link--muted")
                repo_stars = star_tag.get_text(strip=True) if star_tag else "0"

                # store the record
                repo_data.append(
                    {
                        "Name": repo_name,
                        "URL": repo_url,
                        "Description": repo_description,
                        "Stars": repo_stars,
                    }
                )

            # look for the link to the next page of results
            next_page_link = soup.find('a', string='Next')  # the <a> tag whose text is 'Next'
            if next_page_link and 'href' in next_page_link.attrs:
                # the href is usually relative, so resolve it against github.com
                next_url = requests.compat.urljoin("https://github.com", next_page_link['href'])
                urls_to_visit.append(next_url)
                print(f"Found next page: {next_url}")

            # throttle requests so GitHub does not rate-limit the crawl
            time.sleep(2)

        except Exception as e:
            print(f"Error while visiting {current_url}: {e}")
            continue


# run the crawler
crawler()

# print the collected data as JSON
print(json.dumps(repo_data, indent=4, ensure_ascii=False))

# save the results to a CSV file
csv_filename = "github_android_cve.csv"
with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Name", "URL", "Description", "Stars"])
    writer.writeheader()
    writer.writerows(repo_data)
    print(f"Data saved to {csv_filename}")
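
GitHub's search results page is largely rendered client-side these days, so the HTML selectors above may come back empty depending on what the server returns. A more robust route is GitHub's REST search API, which returns the same repository data as JSON. The sketch below is an alternative, not the method used in this post; it is unauthenticated and therefore subject to GitHub's stricter anonymous rate limits.

# a sketch using GitHub's REST search API instead of scraping the HTML page
import requests

api_url = "https://api.github.com/search/repositories"
params = {"q": "android cve", "per_page": 30}
headers = {"Accept": "application/vnd.github+json", "User-Agent": "Mozilla/5.0"}

response = requests.get(api_url, params=params, headers=headers, timeout=10)
response.raise_for_status()

for item in response.json().get("items", []):
    # full_name, html_url and stargazers_count mirror the fields scraped above
    print(item["full_name"], item["html_url"], item["stargazers_count"])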

Output
