Python scraper (personal use)
from lxml import etree
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup as bs
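# This script walks a foundation site four levels deep: url_one pulls the fund
# list from the "发展动态" (news) block, url_two follows the pagination links,
# url_three collects the article links on each listing page, and url_four parses
# the donation table on each article page into a pandas DataFrame, which is
# written to Excel at the end.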
def url_one(main_url):
    # Fetch the main page and pull fund names and links out of the
    # "发展动态" (news) <ul> block.
    resp = requests.get(main_url)
    obj1 = re.compile(r"发展动态.*?<li>(?P<ul>.*?)</ul>", re.S)
    result1 = obj1.search(resp.text)  # search, not finditer: .group() needs a single match
    ul = result1.group("ul")
    obj2 = re.compile(r"\">(?P<code>.*?)基金", re.S)  # fund name ends with "基金" (fund)
    obj3 = re.compile(r"<a href=\"(?P<href>.*?)\">", re.S)
    code_names_ = obj2.finditer(ul)
    hrefs = obj3.finditer(ul)
    code_name_list = []
    href_list = []
    for code_names in code_names_:
        code_name_list.append(code_names.group("code"))
    for it_href in hrefs:
        href_list.append(it_href.group("href"))
    resp.close()
    return code_name_list, href_list
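# The requests.get calls in this script send no headers and no timeout, and some
# sites reject bare clients or hang. A minimal optional helper the calls could be
# routed through -- fetch and HEADERS are illustrative additions, not part of the
# original script, and the User-Agent value is an assumption:
HEADERS = {"User-Agent": "Mozilla/5.0"}

def fetch(url):
    # one place to set headers and a timeout for every request
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return resp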
def url_two(url2):
    # Collect the pagination links that follow the "首页" (home) link,
    # de-duplicated and sorted.
    hrefs_2_list = []
    obj_url2 = re.compile(r"&\">首页</a>(?P<ul>.*?)</div>", re.S)
    obj4 = re.compile(r"<a href=\"(?P<href>.*?)\"", re.S)
    resp2 = requests.get(url2)
    hrefs_2_ = obj_url2.finditer(resp2.text)
    for hrefs_2 in hrefs_2_:
        href_2 = hrefs_2.group("ul")
        href_2_urls_ = obj4.finditer(href_2)
        for href_2_urls in href_2_urls_:
            href_2_url = href_2_urls.group("href")
            href_2_url = href_2_url.replace("&amp;", "&")  # un-escape query-string ampersands
            hrefs_2_list.append(href_2_url)
    hrefs_2_list = sorted(set(hrefs_2_list))
    resp2.close()
    return hrefs_2_list
def url_three(url3):
    # Pull the article links out of the "picnews_list" <ul> on a listing page.
    obj1 = re.compile(r"picnews_list.*?<li>(?P<ul>.*?)</ul>", re.S)
    obj2 = re.compile(r"<a href=\"(?P<href>.*?)\" target", re.S)
    resp = requests.get(url3)
    result1 = obj1.search(resp.text)
    ul = result1.group("ul")
    hrefs_ = obj2.finditer(ul)
    href_list = []
    for hrefs in hrefs_:
        href = hrefs.group("href").replace("&amp;", "&")  # un-escape query-string ampersands
        href_list.append(href)
    resp.close()
    return href_list
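# Illustration of the named-group extraction used by the three functions above,
# on a made-up snippet (not real site markup): finditer yields one match per
# link, .group("href") pulls the capture, and "&amp;" must be unescaped because
# raw HTML encodes query-string ampersands.
_demo = '<li><a href="news.html?id=1&amp;page=2" target="_blank">t</a></li>'
for _m in re.finditer(r"<a href=\"(?P<href>.*?)\" target", _demo):
    print(_m.group("href").replace("&amp;", "&"))  # -> news.html?id=1&page=2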
def url_four(url, df, code_name):
    # Open one article page; skip plain announcements, parse HTML donation
    # tables, and record image-only tables as unparseable.
    date = ""
    com_name = ""
    money = ""
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    fileName = html.xpath("/html/body/div[4]/div[2]/div[2]/b/text()")[0]  # page title (brittle absolute XPath)
    pro_name = ""
    if "公示" in fileName:  # "公示" (public notice) pages carry no donation table
        print(fileName, url)
    else:
        try:
            page = bs(resp.text, "html.parser")
            table = page.find("table")
            if table is not None:
                for i, tr in enumerate(table.find_all("tr"), start=1):
                    tds = tr.find_all("td")
                    if i == 1:
                        pro_name = tds[0].text  # first row holds the project name
                    if i >= 4:  # data rows start at the fourth row
                        date = tds[0].text
                        com_name = tds[1].text
                        money = tds[2].text
                        print(date, com_name, money)
                        df = my_df(df, date, com_name, money, pro_name,
                                   fileName, url, code_name)
            else:
                # the table is embedded as an image and cannot be parsed
                pro_name = "表格为图片无法识别"
                df = my_df(df, date, com_name, money, pro_name,
                           fileName, url, code_name)
        except Exception as e:
            print("parse failed:", url, e)
    resp.close()
    return df
def my_df(df, date, com_name, money, pro_name, fileName, url, code_name):
    # Classify the donor by keywords in its name, then append one row to df.
    com_name = com_name.split("(")[0]  # drop any full-width-parenthesised suffix
    if "公司" in com_name:  # "company"
        my_judge = "企业"  # enterprise
    elif "街道办" in com_name or "局" in com_name:  # sub-district office / bureau
        my_judge = "政府"  # government
    elif "村委会" in com_name:  # village committee
        my_judge = "群众自治组织"  # grassroots self-governing organization
    elif "基金" in com_name:  # fund / foundation
        my_judge = "非盈利组织"  # non-profit organization
    else:
        my_judge = "个人"  # individual
    my_se = pd.Series({"日期": date, "收支来源": com_name, "收入金额": money,
                       "收入类型": my_judge, "专项基金名称": pro_name,
                       "专项基金名称G": fileName, "网址": url}, name=code_name)
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    df = pd.concat([df, my_se.to_frame().T])
    time.sleep(1)  # pause between records so the requests stay polite
    print(df)
    return df
if __name__ == "__main__":
    do_main_url = ""  # site root prefixed to relative links (left blank in the original)
    main_url = ""     # entry page URL (left blank in the original)
    excel_path = r"C:\wq1.xlsx"
    code_name_list, href_list_one = url_one(main_url)
    df = pd.DataFrame()
    for i, href in enumerate(href_list_one):
        url2 = main_url + href
        code_name = code_name_list[i]
        href_list_two = url_two(url2)  # pagination pages for this fund
        for href_two in href_list_two:
            url3 = do_main_url + href_two
            href_three_list = url_three(url3)  # article links on each page
            for href_three in href_three_list:
                url4 = do_main_url + href_three
                df = url_four(url4, df, code_name)  # parse each article's table
    df.to_excel(excel_path)
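# Resulting Excel layout (one row per donation record, indexed by fund name):
# 日期 (date), 收支来源 (donor), 收入金额 (amount), 收入类型 (donor type),
# 专项基金名称 (project name from the table), 专项基金名称G (page title), 网址 (URL).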