python读取excel文件

world_in_world

已于 2025-04-21 21:22:34 修改

阅读量184

点赞数

CC 4.0 BY-SA版权

分类专栏： python基础文章标签： python excel

于 2023-10-09 14:36:41 首次发布

本文链接：https://blog.youkuaiyun.com/world_in_world/article/details/133701630

python基础专栏收录该内容

11 篇文章

订阅专栏

文章描述了使用Python库如requests、openpyxl和xlrd抓取网页数据，然后通过openpyxl处理.xlsx和.xls文件中的数据，解决特殊设置单元格问题，最后清洗并提取有价值的数据的过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

参考：https://www.cnblogs.com/wolfstark/p/16895823.html

import requests
from lxml import etree
import openpyxl
import xlrd
import pandas as pd
import os


'''
实例：http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html
.xlsx .xls
'''


headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": ""
}

url = "http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/t20200525_12170746.html"
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
element = etree.HTML(response.text)
lis = element.xpath('//div[@class="main_con_zw"]//p/a/@href')
one = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[0].strip()
two = 'http://zjt.hunan.gov.cn/zjt/bsfw/ggfw/xxcx/202005/' + lis[1].strip()
print(one, '\n', two)

res1 = requests.get(one)
res2 = requests.get(two)
with open('xxx.xlsx', 'wb') as f, open('yyy.xls', 'wb') as p:
    f.write(res1.content)
    p.write(res2.content)


## 方案一：碰到有特殊设置的单元格，可能无法读取到真实的数值
def readExcel(filename):
    company_excel_list = []
    if '.xlsx' in filename:
        workbook = openpyxl.load_workbook(filename)  # 加载文件
        sheetnames = workbook.get_sheet_names()  # 获取所有sheet名称，通过名字的方式
        ws = workbook.get_sheet_by_name(sheetnames[0])  # 获取第一个sheet内容
        # 读取数据
        for r in range(3, ws.max_row + 1):
            old = ws.cell(r, 2).value
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
        workbook.close()

    elif '.xls' in filename:
        workbook = xlrd.open_workbook(filename)  # 加载文件
        sheet = workbook.sheet_by_index(0)  # 获取第一个sheet内容
        # 读取数据
        for row_index in range(2, sheet.nrows):
            old = sheet.cell_value(row_index, 1)
            if old and len(old) > 1:
                company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list


## 方案二

### 样例1
def readExcel2(filename):
    company_excel_list = []
    df = pd.read_excel(filename, header=1)
    df.fillna('', inplace=True)
    old_list = df.iloc[:, 1].values
    for old in old_list:
        if old and len(old) > 1:
            company_excel_list.append(old.strip().replace(' ', '').replace('\n', ''))
    os.remove(filename)
    return company_excel_list

filename = ['xxx.xlsx', 'yyy.xls']
for i in filename:
    company_excel_list = readExcel2(i)
    for company in company_excel_list:
        print(company)

### 样例2
class MyUtils:
    def read_excel(self, filename):
        arr_list = []
        df = pd.read_excel(filename, header=1)
        df.fillna('', inplace=True)
        old_list = df.iloc[:, [0, 8, 15]].values
        for old in old_list:
            if not old[1]:
                arr_list.append(old)
                continue
            pattern = r'[ \s\t\n\r\f\v\u00A0\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+'
            res = bool(re.search(pattern, old[1]))
            if res:
                arr_list.append(old)
        data_list = [arr.tolist() for arr in arr_list]
        print(data_list)
        return data_list


if __name__ == '__main__':
    utils = MyUtils()
    data_list = utils.read_excel('C:\\Users\\XX\\Desktop\\xxx.xlsx')