Python (for personal use)

This script crawls a foundation website's 发展动态 (development news) section: it reads the fund list from the index page, follows each fund's pagination, opens every disclosure article, parses the income table on each page, and writes all rows to an Excel file.


from lxml import etree
import requests
import re
import time
import pandas as pd
from bs4 import BeautifulSoup as bs


def url_one(main_url):
    # Fetch the index page and pull every fund name and link out of the
    # 发展动态 (development news) <ul> block.
    resp = requests.get(main_url)
    obj1 = re.compile(r"发展动态.*?<li>(?P<ul>.*?)</ul>", re.S)
    result1 = obj1.search(resp.text)  # search, not finditer: .group() needs a single Match
    ul = result1.group("ul")
    obj2 = re.compile(r"\">(?P<code>.*?)基金", re.S)
    obj3 = re.compile(r"<a href=\"(?P<href>.*?)\">", re.S)
    code_name_list = [m.group("code") for m in obj2.finditer(ul)]
    href_list = [m.group("href") for m in obj3.finditer(ul)]
    resp.close()
    return code_name_list, href_list
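
# For reference, what url_one's regexes extract from a listing snippet
# (the sample HTML here is invented for illustration):
#   >>> ul = re.search(r"发展动态.*?<li>(?P<ul>.*?)</ul>",
#   ...                '发展动态<li><a href="/f/1.html">希望基金</a></li></ul>', re.S).group("ul")
#   >>> re.findall(r"\">(.*?)基金", ul)
#   ['希望']
#   >>> re.findall(r"<a href=\"(.*?)\">", ul)
#   ['/f/1.html']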


def url_two(url2):
    # Collect the pagination links that follow 首页 ("home") in the pager
    # <div>, de-duplicated and sorted.
    hrefs_2_list = []
    obj_url2 = re.compile(r"&\">首页</a>(?P<ul>.*?)</div>", re.S)
    obj4 = re.compile(r"<a href=\"(?P<href>.*?)\"", re.S)
    resp2 = requests.get(url2)
    for hrefs_2 in obj_url2.finditer(resp2.text):
        pager = hrefs_2.group("ul")
        for href_2_urls in obj4.finditer(pager):
            href_2_url = href_2_urls.group("href").replace("&amp;", "&")  # unescape HTML entity
            hrefs_2_list.append(href_2_url)
    hrefs_2_list = sorted(set(hrefs_2_list))
    resp2.close()
    return hrefs_2_list


def url_three(url3):
    # On one listing page, collect the article links inside the
    # picnews_list <ul>.
    obj1 = re.compile(r"picnews_list.*?<li>(?P<ul>.*?)</ul>", re.S)
    obj2 = re.compile(r"<a href=\"(?P<href>.*?)\" target", re.S)
    resp = requests.get(url3)
    ul = obj1.search(resp.text).group("ul")
    href_list = [m.group("href").replace("&amp;", "&") for m in obj2.finditer(ul)]
    resp.close()
    return href_list


def url_four(url, df, code_name):
    # Parse one disclosure page: the page title comes from an XPath lookup,
    # the fund name from the table's first row, and the income records from
    # row 4 onward. Each record is appended to df via my_df().
    resp = requests.get(url)
    html = etree.HTML(resp.text)
    fileName = html.xpath("/html/body/div[4]/div[2]/div[2]/b/text()")[0]
    if "公示" in fileName:  # announcement pages carry no income table; skip them
        print(fileName, url)
    else:
        try:
            page = bs(resp.text, "html.parser")
            rows = page.find("table").find_all("tr")
            pro_name = ""
            for i, tr in enumerate(rows, start=1):
                tds = tr.find_all("td")
                if i == 1:
                    pro_name = tds[0].text  # first row holds the project (fund) name
                elif i >= 4:  # data rows start at the fourth row
                    date = tds[0].text
                    com_name = tds[1].text
                    money = tds[2].text
                    print(date, com_name, money)
                    df = my_df(df, date, com_name, money, pro_name, fileName, url, code_name)
        except Exception:
            # No parseable <table> (e.g. the table is an image): record a placeholder row
            df = my_df(df, "", "", "", "表格为图片无法识别", fileName, url, code_name)
    resp.close()
    return df


def my_df(df, date, com_name, money, pro_name, fileName, url, code_name):
    # Classify the income source by keywords in its name, then append
    # one row to df.
    com_name = com_name.split("(")[0]  # drop anything after a full-width parenthesis
    if "公司" in com_name:                          # "company" -> enterprise
        my_judge = "企业"
    elif "街道办" in com_name or "局" in com_name:  # street office / bureau -> government
        my_judge = "政府"
    elif "村委会" in com_name:                      # village committee -> self-governing organization
        my_judge = "群众自治组织"
    elif "基金" in com_name:                        # fund -> non-profit
        my_judge = "非盈利组织"
    else:                                           # default -> individual
        my_judge = "个人"

    my_se = pd.Series({"日期": date, "收支来源": com_name, "收入金额": money,
                       "收入类型": my_judge, "专项基金名称": pro_name,
                       "专项基金名称G": fileName, "网址": url}, name=code_name)
    df = pd.concat([df, my_se.to_frame().T])  # DataFrame.append was removed in pandas 2.x
    time.sleep(1)  # throttle requests a little between rows
    print(df)
    return df
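
# e.g. how my_df classifies a source name:
#   "XX投资有限公司" -> 企业 (enterprise)
#   "XX街道办"       -> 政府 (government)
#   "XX村委会"       -> 群众自治组织 (self-governing organization)
#   "XX基金会"       -> 非盈利组织 (non-profit)
#   anything else    -> 个人 (individual)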



if __name__ == "__main__":
    # Fill in the site's base URLs before running; they are left blank here.
    do_main_url = ""  # base URL that listing/article hrefs are joined onto
    main_url = ""     # URL of the index page containing the 发展动态 block
    excel_path = r"C:\wq1.xlsx"
    code_name_list, href_list_one = url_one(main_url)
    df = pd.DataFrame()

    # Index page -> per-fund page -> pagination -> article list -> income table.
    for i, href in enumerate(href_list_one):
        url2 = main_url + href
        code_name = code_name_list[i]
        href_list_two = url_two(url2)
        for href_two in href_list_two:
            url3 = do_main_url + href_two
            href_three_list = url_three(url3)
            for href_three in href_three_list:
                url4 = do_main_url + href_three
                df = url_four(url4, df, code_name)
    df.to_excel(excel_path)
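
If the site rejects bare requests, a defensive variant of the fetch used throughout the script may help. This is a minimal sketch; the User-Agent string and the timeout value are assumptions, not part of the original:

headers = {"User-Agent": "Mozilla/5.0"}     # hypothetical UA string
resp = requests.get(main_url, headers=headers, timeout=10)
resp.encoding = resp.apparent_encoding      # guard against mojibake on GBK-encoded pages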
