import pandas as pd
import glob
import os
# 设置文件路径和读取所有12个数据文件(假设为Excel文件,可改为.csv)
file_path = "C:/Users/12203/Desktop/2017年12月武汉空气质量指数日历史数.xls" # 可根据实际路径修改
files = glob.glob(file_path)
# 定义目标列的标准名称(用于统一大小写)
target_columns = ['日期', 'AQI', 'PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3_8h']
column_mapping = {col.lower().replace('.', ''): col for col in target_columns}
all_data = []
for file in files:
# 读取数据,尝试自动识别编码(适用于csv)
if file.endswith('.csv'):
df = pd.read_csv(file)
else:
df = pd.read_excel(file)
# 将列名标准化:去空格、转小写、去除特殊字符用于映射
df.columns = [col.strip().lower().replace('.', '').replace('_', '') for col in df.columns]
# 映射回标准列名
rename_dict = {}
for col in df.columns:
if col in column_mapping:
rename_dict[col] = column_mapping[col]
df = df.rename(columns=rename_dict)
# 保留目标列中实际存在的列
existing_cols = [col for col in target_columns if col in df.columns]
df = df[existing_cols]
# 确保“日期”列为datetime类型
if '日期' in df.columns:
df['日期'] = pd.to_datetime(df['日期'], errors='coerce')
all_data.append(df)
# 合并所有数据
merged_df = pd.concat(all_data, ignore_index=True)
# 去除日期为空的行
merged_df.dropna(subset=['日期'], inplace=True)
# 按日期从小到大排序
merged_df.sort_values(by='日期', ascending=True, inplace=True)
# 重置索引
merged_df.reset_index(drop=True, inplace=True)
# 保存路径设为桌面
output_path = r"C:/Users/12203/Desktop/merged_air_quality_data2.xlsx"
# 判断文件格式并保存
if output_path.endswith('.csv'):
merged_df.to_csv(output_path, index=False, encoding='utf-8-sig')
else:
merged_df.to_excel(output_path, index=False)
print(f"数据已成功合并并保存至: {output_path}")以上代码出现
ModuleNotFoundError Traceback (most recent call last)
File D:\anaconda3\Lib\site-packages\pandas\compat\_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
134 try:
--> 135 module = importlib.import_module(name)
136 except ImportError:
File D:\anaconda3\Lib\importlib\__init__.py:90, in import_module(name, package)
89 level += 1
---> 90 return _bootstrap._gcd_import(name[level:], package, level)
File <frozen importlib._bootstrap>:1387, in _gcd_import(name, package, level)
File <frozen importlib._bootstrap>:1360, in _find_and_load(name, import_)
File <frozen importlib._bootstrap>:1324, in _find_and_load_unlocked(name, import_)
ModuleNotFoundError: No module named 'xlrd'
During handling of the above exception, another exception occurred:
ImportError Traceback (most recent call last)
Cell In[6], line 20
18 df = pd.read_csv(file)
19 else:
---> 20 df = pd.read_excel(file)
22 # 将列名标准化:去空格、转小写、去除特殊字符用于映射
23 df.columns = [col.strip().lower().replace('.', '').replace('_', '') for col in df.columns]
File D:\anaconda3\Lib\site-packages\pandas\io\excel\_base.py:495, in read_excel(io, sheet_name, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, parse_dates, date_parser, date_format, thousands, decimal, comment, skipfooter, storage_options, dtype_backend, engine_kwargs)
493 if not isinstance(io, ExcelFile):
494 should_close = True
--> 495 io = ExcelFile(
496 io,
497 storage_options=storage_options,
498 engine=engine,
499 engine_kwargs=engine_kwargs,
500 )
501 elif engine and engine != io.engine:
502 raise ValueError(
503 "Engine should not be specified when passing "
504 "an ExcelFile - ExcelFile already has the engine set"
505 )
File D:\anaconda3\Lib\site-packages\pandas\io\excel\_base.py:1567, in ExcelFile.__init__(self, path_or_buffer, engine, storage_options, engine_kwargs)
1564 self.engine = engine
1565 self.storage_options = storage_options
-> 1567 self._reader = self._engines[engine](
1568 self._io,
1569 storage_options=storage_options,
1570 engine_kwargs=engine_kwargs,
1571 )
File D:\anaconda3\Lib\site-packages\pandas\io\excel\_xlrd.py:45, in XlrdReader.__init__(self, filepath_or_buffer, storage_options, engine_kwargs)
33 """
34 Reader using xlrd engine.
35
(...)
42 Arbitrary keyword arguments passed to excel engine.
43 """
44 err_msg = "Install xlrd >= 2.0.1 for xls Excel support"
---> 45 import_optional_dependency("xlrd", extra=err_msg)
46 super().__init__(
47 filepath_or_buffer,
48 storage_options=storage_options,
49 engine_kwargs=engine_kwargs,
50 )
File D:\anaconda3\Lib\site-packages\pandas\compat\_optional.py:138, in import_optional_dependency(name, extra, errors, min_version)
136 except ImportError:
137 if errors == "raise":
--> 138 raise ImportError(msg)
139 return None
141 # Handle submodules: if we have submodule, grab parent module from sys.modules
ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 2.0.1 for xls Excel support Use pip or conda to install xlrd.
1 数据增强
最新发布