本文将介绍地理空间数据质量评估的关键指标、常见质量问题及自动化清洗方法,并提供实用的Python代码实现。
一、数据质量评估指标
1.1 空间数据完整性检查
import geopandas as gpd
import numpy as np
def check_completeness(gdf):
    """Assess the geometric completeness of a GeoDataFrame.

    Each feature falls into at most one defect category, so the three
    counts can be added up without counting any feature twice:

    * ``missing_geometry``: the geometry is null.
    * ``empty_geometry``: the geometry is present but empty.
    * ``invalid_geometry``: the geometry is present, non-empty and fails
      the validity check.

    Args:
        gdf: GeoDataFrame whose active geometry column is inspected.

    Returns:
        dict with ``total_features``, the three defect counts and
        ``completeness_score`` (the share of features that are in none of
        the defect categories). The score is NaN when ``gdf`` has no rows.
    """
    geometry = gdf.geometry
    missing = geometry.isnull()
    empty = geometry.is_empty & ~missing
    # is_valid is False for null geometries, so without the extra masks a
    # null geometry would be counted as both "missing" and "invalid".
    invalid = ~geometry.is_valid & ~missing & ~empty

    total = len(gdf)
    results = {
        'total_features': total,
        'missing_geometry': int(missing.sum()),
        'empty_geometry': int(empty.sum()),
        'invalid_geometry': int(invalid.sum()),
    }
    defective = (
        results['missing_geometry']
        + results['empty_geometry']
        + results['invalid_geometry']
    )
    # A ratio over zero features is undefined; report NaN explicitly instead
    # of relying on a division-by-zero warning.
    results['completeness_score'] = (
        1 - defective / total if total else float('nan')
    )
    return results
# Usage example: score the completeness of a road layer and print each metric.
if __name__ == "__main__":
    roads = gpd.read_file("roads.shp")
    metrics = check_completeness(roads)
    print("数据完整性评估:")
    for k, v in metrics.items():
        print(f"{k}: {v}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
1.2 位置精度评估
from scipy.spatial import distance
def positional_accuracy(gdf, reference_gdf, buffer_distance=10):
    """Assess positional accuracy against a reference dataset.

    For every geometry in ``gdf`` the distance to the nearest feature of
    ``reference_gdf`` is computed. A feature counts as matched when that
    distance does not exceed ``buffer_distance``.

    Args:
        gdf: GeoDataFrame under test.
        reference_gdf: GeoDataFrame holding the reference features.
        buffer_distance: Largest nearest-feature distance that still counts
            as a match, in the same units as ``reference_gdf.distance``
            returns.

    Returns:
        dict with ``matched_features``, ``match_rate``, ``mean_distance``
        and ``std_distance``. ``match_rate`` is None when ``gdf`` has no
        rows; the two distance statistics are None when nothing matched.
    """
    distances = []
    for geom in gdf.geometry:
        # min() yields NaN for an empty reference set or a null geometry;
        # NaN fails the comparison below, so such features stay unmatched
        # instead of raising the way idxmin() + label lookup did.
        nearest = reference_gdf.distance(geom).min()
        if nearest <= buffer_distance:
            distances.append(nearest)

    matched = len(distances)
    total = len(gdf)
    return {
        'matched_features': matched,
        # Guard against dividing by zero on an empty input.
        'match_rate': matched / total if total else None,
        'mean_distance': float(np.mean(distances)) if distances else None,
        'std_distance': float(np.std(distances)) if distances else None,
    }
# Usage example: compare a road layer under test with a reference layer.
if __name__ == "__main__":
    candidate = gpd.read_file("test_roads.shp")
    reference = gpd.read_file("ref_roads.shp")
    stats = positional_accuracy(candidate, reference)
    print("\n位置精度评估:")
    for k, v in stats.items():
        print(f"{k}: {v}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
二、常见质量问题修复
2.1 几何有效性修复
from shapely.validation import make_valid
def fix_invalid_geometries(gdf):
    """Return a copy of ``gdf`` with invalid geometries repaired.

    Each invalid geometry in the active geometry column is passed through
    ``make_valid``. Valid and null geometries are left untouched.

    Args:
        gdf: GeoDataFrame to repair; it is not modified.

    Returns:
        A new GeoDataFrame with the same index and columns as ``gdf``.
    """
    def _repair(geom):
        # Null geometries have no is_valid attribute; pass them through.
        if geom is None or geom.is_valid:
            return geom
        return make_valid(geom)

    fixed_gdf = gdf.copy()
    geometry = fixed_gdf.geometry
    # Replace the whole column at once: unlike per-row .at[] writes this
    # honours the active geometry column name and duplicate index labels.
    fixed_gdf[geometry.name] = gpd.GeoSeries(
        [_repair(geom) for geom in geometry],
        index=geometry.index,
        crs=geometry.crs,
    )
    return fixed_gdf
# Usage example: count the invalid geometries before and after the repair.
if __name__ == "__main__":
    invalid_data = gpd.read_file("invalid_polygons.shp")
    # ~is_valid flags every invalid geometry; summing the flags counts them.
    print(f"修复前无效几何数量: {sum(~invalid_data.geometry.is_valid)}")
    fixed_data = fix_invalid_geometries(invalid_data)
    print(f"修复后无效几何数量: {sum(~fixed_data.geometry.is_valid)}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
2.2 拓扑错误修正
def fix_topological_errors(gdf, buffer_size=0.0001):
    """Return a copy of ``gdf`` with common polygon topology errors fixed.

    Polygon and MultiPolygon geometries are rebuilt with ``buffer(0)``.
    For results that are single Polygons, interior rings whose area is
    below ``buffer_size`` are then dropped. Null geometries and all other
    geometry types are returned unchanged.

    Args:
        gdf: GeoDataFrame to clean; it is not modified.
        buffer_size: Minimum area an interior ring must have to be kept.
            Despite its name it is compared against ring areas, not used
            as a buffer distance.

    Returns:
        A new GeoDataFrame with the same index and columns as ``gdf``.
    """
    # Imported locally because the surrounding file never imports Polygon.
    from shapely.geometry import Polygon

    def _clean(geom):
        if geom is None:
            return geom
        # buffer(0) turns points and lines into empty polygons, so it is
        # only applied to areal geometries.
        if geom.geom_type not in ('Polygon', 'MultiPolygon'):
            return geom
        geom = geom.buffer(0)
        if geom.geom_type == 'Polygon' and len(geom.interiors) > 0:
            kept_rings = [
                ring for ring in geom.interiors
                if Polygon(ring).area >= buffer_size
            ]
            geom = Polygon(geom.exterior, kept_rings)
        return geom

    cleaned_gdf = gdf.copy()
    geometry = cleaned_gdf.geometry
    # Replace the whole column at once so that index labels are respected;
    # enumerate() positions are not valid labels for a non-default index.
    cleaned_gdf[geometry.name] = gpd.GeoSeries(
        [_clean(geom) for geom in geometry],
        index=geometry.index,
        crs=geometry.crs,
    )
    return cleaned_gdf
# Usage example: clean a polygon layer and write the result to a new file.
if __name__ == "__main__":
    source_layer = gpd.read_file("dirty_polygons.shp")
    result_layer = fix_topological_errors(source_layer)
    result_layer.to_file("cleaned_polygons.shp")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
三、属性数据质量控制
3.1 属性一致性检查
import pandas as pd
def check_attribute_consistency(gdf, rules):
    """Find the rows that violate attribute validation rules.

    Args:
        gdf: DataFrame or GeoDataFrame to check.
        rules: Mapping of column name to a rule dict. Supported rule types:

            * ``{'type': 'range', 'min': ..., 'max': ...}``: the value must
              lie within the inclusive range.
            * ``{'type': 'categorical', 'values': [...]}``: the value must
              be one of ``values``.
            * ``{'type': 'not_null'}``: the value must not be null.

            Rules of any other type are ignored.

    Returns:
        List of index labels of the violating rows, without duplicates, in
        the order in which the violations were found (rule by rule, and in
        row order within a rule).
    """
    violations = []
    for col, rule in rules.items():
        rule_type = rule['type']
        if rule_type == 'range':
            mask = ~gdf[col].between(rule['min'], rule['max'])
        elif rule_type == 'categorical':
            mask = ~gdf[col].isin(rule['values'])
        elif rule_type == 'not_null':
            mask = gdf[col].isnull()
        else:
            continue
        violations.extend(mask[mask].index.tolist())
    # dict.fromkeys removes duplicates while keeping first-seen order;
    # list(set(...)) returned the labels in arbitrary order.
    return list(dict.fromkeys(violations))
# Usage example: validate building attributes and report the violation count.
if __name__ == "__main__":
    buildings = gpd.read_file("buildings.shp")
    height_rule = {'type': 'range', 'min': 1, 'max': 500}
    type_rule = {
        'type': 'categorical',
        'values': ['residential', 'commercial', 'industrial'],
    }
    address_rule = {'type': 'not_null'}
    invalid_ids = check_attribute_consistency(
        buildings,
        {'height': height_rule, 'type': type_rule, 'address': address_rule},
    )
    print(f"发现{len(invalid_ids)}条违反规则的记录")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
3.2 属性数据修复
def clean_attributes(gdf, rules):
    """Return a copy of ``gdf`` with attribute values repaired per ``rules``.

    ``rules`` maps a column name to a rule dict whose ``'type'`` selects the
    repair:

    * ``'range'``: clip values to ``[rule['min'], rule['max']]``.
    * ``'categorical'``: replace values not in ``rule['values']`` with
      ``rule['default']``.
    * ``'fill_null'``: replace nulls with ``rule['value']``.

    Rules of any other type leave their column unchanged.
    """
    result = gdf.copy()
    for column, spec in rules.items():
        kind = spec['type']
        if kind == 'range':
            # Out-of-range values collapse onto the nearest bound.
            lower, upper = spec['min'], spec['max']
            result[column] = result[column].clip(lower, upper)
            continue
        if kind == 'categorical':
            # Keep recognised categories; everything else gets the default.
            recognised = result[column].isin(spec['values'])
            result[column] = result[column].where(recognised, spec['default'])
            continue
        if kind == 'fill_null':
            result[column] = result[column].fillna(spec['value'])
    return result
# Usage example: repair parcel attributes and write the cleaned layer to disk.
if __name__ == "__main__":
    parcels = gpd.read_file("parcels.shp")
    area_rule = {'type': 'range', 'min': 10, 'max': 10000}
    land_use_rule = {
        'type': 'categorical',
        'values': ['residential', 'commercial', 'park'],
        'default': 'residential',
    }
    owner_rule = {'type': 'fill_null', 'value': 'UNKNOWN'}
    repaired = clean_attributes(
        parcels,
        {'area': area_rule, 'land_use': land_use_rule, 'owner': owner_rule},
    )
    repaired.to_file("cleaned_parcels.shp")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
四、自动化质量控制流程
4.1 综合质量评估报告
def generate_quality_report(gdf, reference_gdf=None):
    """Build a combined quality report for a GeoDataFrame.

    Args:
        gdf: GeoDataFrame to assess.
        reference_gdf: Optional reference GeoDataFrame. When given, the
            result of ``positional_accuracy(gdf, reference_gdf)`` is merged
            into the report.

    Returns:
        dict holding the entries from ``check_completeness(gdf)``, the
        positional accuracy entries if a reference was supplied,
        ``valid_geometry_rate`` (mean of the validity flags) and
        ``attribute_completeness`` (per attribute column, the share of
        non-null values).
    """
    report = {}
    # Completeness of the geometry column.
    report.update(check_completeness(gdf))
    # Positional accuracy, only when reference data is available.
    if reference_gdf is not None:
        report.update(positional_accuracy(gdf, reference_gdf))

    geometry = gdf.geometry
    report['valid_geometry_rate'] = geometry.is_valid.mean()

    # Skip the active geometry column by its real name; it is not always
    # called 'geometry'.
    geometry_column = geometry.name
    report['attribute_completeness'] = {
        col: 1 - gdf[col].isnull().mean()
        for col in gdf.columns
        if col != geometry_column
    }
    return report
# Usage example: assess a network layer against its reference and print it.
if __name__ == "__main__":
    network = gpd.read_file("network.shp")
    reference_network = gpd.read_file("ref_network.shp")
    report = generate_quality_report(network, reference_network)
    print("\n综合质量评估报告:")
    for k, v in report.items():
        print(f"{k}: {v}")
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
4.2 自动化清洗流程
def automated_cleaning_pipeline(gdf, reference_gdf=None, cleaning_rules=None):
    """Run geometry repair, attribute cleaning and quality reporting in order.

    Geometries are repaired first (invalid geometries, then topological
    errors). Attribute cleaning runs only when ``cleaning_rules`` is truthy.
    The quality report is generated from the cleaned data.

    Returns:
        Tuple ``(cleaned, report)`` of the cleaned data and its report.
    """
    # Step 1: geometry repairs, chained.
    result = fix_topological_errors(fix_invalid_geometries(gdf))

    # Step 2: attribute cleaning is optional.
    if cleaning_rules:
        result = clean_attributes(result, cleaning_rules)

    # Step 3: report on the final state of the data.
    return result, generate_quality_report(result, reference_gdf)
# Usage example: run the full pipeline, save the result and print the report.
if __name__ == "__main__":
    source = gpd.read_file("raw_data.shp")
    reference = gpd.read_file("reference_data.shp")
    value_rule = {'type': 'range', 'min': 0, 'max': 100}
    category_rule = {
        'type': 'categorical',
        'values': ['A', 'B', 'C'],
        'default': 'A',
    }
    cleaned_data, report = automated_cleaning_pipeline(
        source,
        reference_gdf=reference,
        cleaning_rules={'value': value_rule, 'category': category_rule},
    )
    cleaned_data.to_file("cleaned_data.shp")
    print("\n清洗完成,质量报告:")
    print(report)
- 1.
- 2.
- 3.
- 4.
- 5.
- 6.
- 7.
- 8.
- 9.
- 10.
- 11.
- 12.
- 13.
- 14.
- 15.
- 16.
- 17.
- 18.
- 19.
- 20.
- 21.
- 22.
- 23.
- 24.
- 25.
- 26.
- 27.
- 28.
- 29.
- 30.
- 31.
- 32.
- 33.
- 34.
五、总结
本文介绍了地理空间数据质量管理的完整流程:
- 质量评估 - 完整性、位置精度、属性一致性等指标
- 几何修复 - 有效性检查、拓扑错误修正
- 属性清洗 - 范围检查、类别验证、空值处理
- 自动化流程 - 综合质量报告和一站式清洗
实际应用中应根据数据类型和使用场景调整质量标准和清洗策略。定期进行数据质量评估可确保GIS分析的可靠性。