公益数据爬虫脚本
代码
"""
Created on Sat Jan 27 21:56:47 2018
@author: caofk
"""
from selenium import webdriver
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
import time
import re
import pandas as pd
# Launch Firefox and open the Tencent charity project-listing page.
root = "http://gongyi.qq.com/succor/project_list.htm"
browser = webdriver.Firefox()
browser.get(root)
# Crude wait: give the page's JavaScript time to render the project list.
time.sleep(5)
# Walk every (status, category) combination of the site's two filter
# dropdowns and record how many result rows and pages each combination has.
# One metadata row per combination is accumulated into `meta_info`.
# (The original paste lost all indentation; the loop nesting is restored here.)
meta_info = pd.DataFrame()
for s_status in range(1, 4):  # 1..3: the three project-status filter entries
    # Hover over the status dropdown so its option list becomes clickable.
    choose = browser.find_element_by_css_selector("#s_status_text")
    ActionChains(browser).move_to_element(choose).perform()
    time.sleep(5)  # crude wait for the dropdown/AJAX to settle
    choose = browser.find_element_by_css_selector(
        "#s_status_list > li:nth-child(%d) > a:nth-child(1)" % s_status)
    s_status_name = choose.text
    choose.click()
    for s_tid in range(2, 7):  # 2..6: the five category filter entries
        base_info = pd.DataFrame()
        # Hover over the category dropdown, then pick the s_tid-th entry.
        choose = browser.find_element_by_css_selector("#s_tid_text")
        ActionChains(browser).move_to_element(choose).perform()
        time.sleep(5)
        choose = browser.find_element_by_css_selector(
            "#s_tid_list > li:nth-child(%d) > a:nth-child(1)" % s_tid)
        s_tid_name = choose.text
        choose.click()
        time.sleep(5)
        # Pager text looks like "... N条 ... M页"; pull out both counts.
        page_info = browser.find_element_by_css_selector("#projectPages_wrap").text
        total_rows = re.findall(r"(\d+)条", page_info)
        page_num = re.findall(r"(\d+)页", page_info)
        base_info["s_status_name"] = [s_status_name]
        base_info["s_status"] = [s_status]
        base_info["s_tid_name"] = [s_tid_name]
        # The site's tid query parameter starts at 71 for the 2nd dropdown
        # item, so dropdown index maps to tid as 70 + s_tid - 1.
        base_info["s_tid"] = [70 + s_tid - 1]
        base_info["row"] = total_rows
        # NOTE: "p" must stay the LAST column — the URL-expansion loop below
        # in this file depends on that ordering.
        base_info["p"] = page_num
        meta_info = pd.concat((meta_info, base_info))
# Expand meta_info into one row per result page, attaching the page URL.
# NOTE: this relies on "p" being the LAST column of meta_info — the columns
# before it are copied into base_pd first, then one row is emitted per page.
# (Restored indentation; dropped the unused `info_pd` frame.)
url_pd = pd.DataFrame()
base_pd = pd.DataFrame()
for index, row in meta_info.iterrows():
    for col_name in meta_info.columns:
        if col_name != "p":
            base_pd[col_name] = [row[col_name]]
        else:
            # One URL per page 1..p of this (status, category) combination.
            for p in range(1, int(row[col_name]) + 1):
                base_pd["p"] = p
                base_pd["url"] = root + "#s_status=%d&s_tid=%d&p=%d" % (
                    row["s_status"], row["s_tid"], p)
                url_pd = pd.concat((base_pd, url_pd))
# Visit every listing-page URL and capture the raw HTML of its project-list
# container, one page_pd row per URL.
# (Restored indentation; the original if/else had identical assignment in
# both branches, collapsed here; manual counter replaced by enumerate.)
page_pd = pd.DataFrame()
base_pd = pd.DataFrame()
for i, (index, row) in enumerate(url_pd.iterrows(), start=1):
    print(i)  # progress indicator
    for col_name in url_pd.columns:
        base_pd[col_name] = [row[col_name]]
        if col_name == "url":
            # Fetch the page and keep the project-list container HTML.
            browser.get(row[col_name])
            time.sleep(1)  # crude wait for the AJAX-rendered list
            base_pd["item"] = pq(browser.page_source)("#projectList_wrap").html()
    page_pd = pd.concat((page_pd, base_pd))
def _first_match(pattern, flat_text):
    """Return the first match of *pattern* in *flat_text*, or "" if absent."""
    found = re.findall(pattern, flat_text)
    return found[0] if found else ""

# Parse every captured listing page into one row per project card and save
# the result as a CSV.
# The original wrapped each findall in try/except, but re.findall never
# raises on "no match" — it returns []; the except only fired on pandas'
# length-mismatch ValueError. The helper above makes the fallback explicit.
# (Restored indentation; manual counter replaced by enumerate.)
item_pd = pd.DataFrame()
base_pd = pd.DataFrame()
for i, (index, row) in enumerate(page_pd.iterrows(), start=1):
    print(i)  # progress indicator
    for col_name in page_pd.columns:
        if col_name != "item":
            base_pd[col_name] = [row[col_name]]
        else:
            for item in pq(row[col_name])(".pro_li"):
                # Normalise the card text: drop separators and NBSPs.
                text = pq(item).text().replace("|", "").replace("\xa0", "")
                base_pd["公益标题"] = text.split("\n")[0]
                base_pd["公益链接"] = ('http://gongyi.qq.com/succor/'
                                   + pq(item)(".titless").attr('href'))
                # Each field sits between two fixed labels in the card text.
                flat = text.replace("\n", "")
                base_pd["公益简介"] = [_first_match(r'项目简介(.*?)筹款目标', flat)]
                base_pd["筹款目标"] = [_first_match(r'筹款目标(.*?)筹款时间', flat)]
                base_pd["筹款时间"] = [_first_match(r'筹款时间(.*?)执 行 方', flat)]
                base_pd["执行方"] = [_first_match(r'执 行 方(.*?)项目状态', flat)]
                base_pd["项目状态"] = [_first_match(r'项目状态(.*?)已筹', flat)]
                base_pd["筹款情况"] = [_first_match(r'已筹:(.*?)人次捐款', flat)]
                base_pd["筹款进度"] = [_first_match(r'人次捐款(.*?)我要捐款', flat)]
                item_pd = pd.concat((item_pd, base_pd))
# NOTE(review): consider encoding="utf_8_sig" so Excel on Windows renders the
# Chinese headers correctly — left unchanged here to preserve output bytes.
item_pd.to_csv("E:\\公益数据.csv")
数据结果
