python读取excel文件

文章描述了使用Python库如requests、openpyxl和xlrd抓取网页数据,然后通过openpyxl处理.xlsx和.xls文件中的数据,解决特殊设置单元格问题,最后清洗并提取有价值的数据的过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

 参考:https://www.cnblogs.com/wolfstark/p/16895823.html

import requests
from lxml import etree
import openpyxl
import xlrd
import pandas as pd
import os


'''
实例:http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html
.xlsx .xls
'''


headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": ""
}

url = "http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html"
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
element = etree.HTML(response.text)
lis = element.xpath('//div[@class="main_con_zw"]//p/a/@href')
one = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[0].strip()
two = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[1].strip()
print(one, '\n', two)

res1 = requests.get(one)
res2 = requests.get(two)
with open('xxx.xlsx', 'wb') as f, open('yyy.xls', 'wb') as p:
    f.write(res1.content)
    p.write(res2.content)


## 方案一:碰到有特殊设置的单元格,可能无法读取到真实的数值
def readExcel(filename):
    company_excel_list = []
    if '.xlsx' in filename:
        workbook = openpyxl.load_workbook(filename)  # 加载文件
        sheetnames = workbook.get_sheet_names()  # 获取所有sheet名称,通过名字的方式
        ws = workbook.get_sheet_by_name(sheetnames[0])  # 获取第一个sheet内容
        # 读取数据
        for r in range(3, ws.max_row + 1):
            old = ws.cell(r, 2).value
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
        workbook.close()

    elif '.xls' in filename:
        workbook = xlrd.open_workbook(filename)  # 加载文件
        sheet = workbook.sheet_by_index(0)  # 获取第一个sheet内容
        # 读取数据
        for row_index in range(2, sheet.nrows):
            old = sheet.cell_value(row_index, 1)
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list


## 方案二

### 样例1
def readExcel2(filename):
    company_excel_list = []
    df = pd.read_excel(filename, header=1)
    df.fillna('', inplace=True)
    old_list = df.iloc[:, 1].values
    for old in old_list:
        if old and len(old) > 1:
            company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list

filename = ['xxx.xlsx', 'yyy.xls']
for i in filename:
    company_excel_list = readExcel2(i)
    for company in company_excel_list:
        print(company)

### 样例2
class MyUtils:
    def read_excel(self, filename):
        arr_list = []
        df = pd.read_excel(filename, header=1)
        df.fillna('', inplace=True)
        old_list = df.iloc[:, [0, 8, 15]].values
        for old in old_list:
            if not old[1]:
                arr_list.append(old)
                continue
            pattern = r'[ \s\t\n\r\f\v\u00A0\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+'
            res = bool(re.search(pattern, old[1]))
            if res:
                arr_list.append(old)
        data_list = [arr.tolist() for arr in arr_list]
        print(data_list)
        return data_list


if __name__ == '__main__':
    utils = MyUtils()
    data_list = utils.read_excel('C:\\Users\\XX\\Desktop\\xxx.xlsx')

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值