# Tencent Charity (腾讯公益) Data Crawler

This post walks through a crawler for the Tencent Charity site (gongyi.qq.com) built with Python, Selenium, and PyQuery. The script steps through the site's filter options, scrapes the details of every listed project, including its title and link, and saves the results to a CSV file.
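
A note on versions: the listing below was written for Selenium 3, whose `find_element_by_css_selector` helpers have since been deprecated and removed in current Selenium 4 releases. As a rough sketch of how the same hover-then-click step would look with the `By` locator API (reusing the selectors from the listing, which may no longer match the page as it exists today):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

# Sketch only: Selenium 4-style locators for the hover-then-click pattern used in the listing.
browser = webdriver.Firefox()
browser.get("http://gongyi.qq.com/succor/project_list.htm")

status_menu = browser.find_element(By.CSS_SELECTOR, "#s_status_text")
ActionChains(browser).move_to_element(status_menu).perform()  # hovering opens the filter dropdown
first_option = browser.find_element(By.CSS_SELECTOR, "#s_status_list > li:nth-child(1) > a:nth-child(1)")
first_option.click()
```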


## Code

# -*- coding: utf-8 -*-
"""
Created on Sat Jan 27 21:56:47 2018

@author: caofk
"""

from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import pandas as pd

browser = webdriver.Firefox()
root = "http://gongyi.qq.com/succor/project_list.htm"
browser.get(root)
time.sleep(5)
meta_info = pd.DataFrame()
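# Step 1: hover over the two filter dropdowns (#s_status_text and #s_tid_text), click
# each option in turn, and record the total row count and page count reported for
# every filter combination in meta_info.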
for s_status in range(1,4):
    choose = browser.find_element_by_css_selector("#s_status_text")
    ActionChains(browser).move_to_element(choose).perform()
    time.sleep(5)
    choose = browser.find_element_by_css_selector("#s_status_list > li:nth-child(%d) > a:nth-child(1)" %s_status)
    s_status_name = choose.text
    choose.click()
    for s_tid in range(2,7):
        base_info = pd.DataFrame()
        choose = browser.find_element_by_css_selector("#s_tid_text")
        ActionChains(browser).move_to_element(choose).perform()
        time.sleep(5)
        choose = browser.find_element_by_css_selector("#s_tid_list > li:nth-child(%d) > a:nth-child(1)" %s_tid)
        s_tid_name = choose.text
        choose.click()
        time.sleep(5)
        page_info = browser.find_element_by_css_selector("#projectPages_wrap").text
        total_rows = re.findall(r"(\d+)条", page_info)  # number before "条" = total projects
        page_num = re.findall(r"(\d+)页", page_info)    # number before "页" = total result pages
        init_url = browser.current_url
        base_info["s_status_name"] = [s_status_name]
        base_info["s_status"] = [s_status]
        base_info["s_tid_name"] = [s_tid_name]
        base_info["s_tid"] = [70+s_tid-1]  # nth-child index 2..6 maps to the site's s_tid values 71..75
        base_info["row"] = total_rows
        base_info["p"] = page_num
        meta_info = pd.concat((meta_info,base_info))

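# Step 2: expand meta_info into one URL per result page, using the s_status, s_tid
# and p hash parameters the site uses for filtering and pagination.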
url_pd = pd.DataFrame()
base_pd = pd.DataFrame()
info_pd = pd.DataFrame()
for index, row in meta_info.iterrows():
    for col_name in meta_info.columns:
        if col_name != "p":
            base_pd[col_name] = [row[col_name]]
        else:
            for p in range(1,int(row[col_name])+1):
                base_pd["p"] = p
                base_pd["url"] = root+"#s_status=%d&s_tid=%d&p=%d" %(row["s_status"],row["s_tid"],p)
                url_pd = pd.concat((base_pd,url_pd))


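# Step 3: open every paginated URL and store the raw HTML of the #projectList_wrap
# container alongside its filter/page metadata.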
page_pd = pd.DataFrame()
base_pd = pd.DataFrame()
i = 0
for index, row in url_pd.iterrows():
    i = i+1
    print(i)
    for col_name in url_pd.columns:
        if col_name != "url":
            base_pd[col_name] = [row[col_name]]
        else:
            base_pd[col_name] = [row[col_name]]
            browser.get(row[col_name])
            time.sleep(1)
            base_pd["item"] = pq(browser.page_source)("#projectList_wrap").html()
            page_pd = pd.concat((page_pd, base_pd))

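# Step 4: parse each ".pro_li" project card out of the saved HTML and split its text
# into individual fields with regular expressions, one output row per project.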
item_pd = pd.DataFrame()       
base_pd = pd.DataFrame()
const = [""]  # fallback value for fields the regexes fail to extract
i = 0
for index, row in page_pd.iterrows():
    i = i+1
    print(i)
    for col_name in page_pd.columns:
        if col_name != "item":
            base_pd[col_name] = [row[col_name]]
        else:
            for item in pq(row[col_name])(".pro_li"):
                text = pq(item).text().replace("|","").replace("\xa0","")
                base_pd["公益标题"] = text.split("\n")[0]
                base_pd["公益链接"] = 'http://gongyi.qq.com/succor/'+pq(item)(".titless").attr('href')
                # re.findall never raises here; when a pattern fails to match it returns
                # an empty list, and assigning that to the one-row frame raises a
                # ValueError, which the except catches before falling back to const.
                flat = text.replace("\n", "")
                patterns = [
                    ("公益简介", r'项目简介(.*?)筹款目标'),
                    ("筹款目标", r'筹款目标(.*?)筹款时间'),
                    ("筹款时间", r'筹款时间(.*?)执 行 方'),
                    ("执行方", r'执 行 方(.*?)项目状态'),
                    ("项目状态", r'项目状态(.*?)已筹'),
                    ("筹款情况", r'已筹:(.*?)人次捐款'),
                    ("筹款进度", r'人次捐款(.*?)我要捐款'),
                ]
                for col, pattern in patterns:
                    try:
                        base_pd[col] = re.findall(pattern, flat)
                    except Exception:
                        base_pd[col] = const
                item_pd = pd.concat((item_pd, base_pd))

item_pd.to_csv("E:\\公益数据.csv") 

## Results

(The original post showed a screenshot of the exported CSV here.)
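
To work with the exported file afterwards, a minimal sketch for loading it back with pandas (assuming the `E:\公益数据.csv` path used in the listing; every row was concatenated with index 0, so the index is reset on read):

```python
import pandas as pd

# Sketch: read the exported CSV back for analysis.
# index_col=0 absorbs the unnamed index column that to_csv wrote out.
df = pd.read_csv("E:\\公益数据.csv", index_col=0).reset_index(drop=True)

print(df.shape)
print(df[["公益标题", "公益链接", "筹款情况", "筹款进度"]].head())
```

If Excel shows the Chinese headers as garbled characters, re-export with `df.to_csv(path, encoding="utf_8_sig")` so the file carries a BOM that Excel recognizes.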
