本文将介绍地理空间数据质量评估的关键指标、常见质量问题及自动化清洗方法，并提供实用的 Python 代码实现。

一、数据质量评估指标

1.1 空间数据完整性检查
import geopandas as gpd
import numpy as np

def check_completeness(gdf):
    """Summarise how many features carry a usable geometry.

    Args:
        gdf: GeoDataFrame whose active geometry column is inspected.

    Returns:
        dict with the keys ``total_features``, ``missing_geometry``,
        ``empty_geometry``, ``invalid_geometry`` and ``completeness_score``.
        The counts are plain ``int`` values. ``invalid_geometry`` counts only
        geometries that are present but not valid. ``completeness_score`` is
        the share of features that are neither missing, empty nor invalid;
        it is NaN when ``gdf`` has no features.
    """
    total = len(gdf)
    if total == 0:
        # The ratio is undefined without features; report NaN explicitly
        # instead of dividing by zero.
        return {
            'total_features': 0,
            'missing_geometry': 0,
            'empty_geometry': 0,
            'invalid_geometry': 0,
            'completeness_score': float('nan'),
        }

    geometry = gdf.geometry
    missing = geometry.isnull()
    empty = geometry.is_empty
    # GeoPandas reports a missing geometry as "not valid"; mask those out so a
    # missing feature is not counted (and penalised) twice.
    invalid = ~geometry.is_valid & ~missing
    # A feature is defective if it falls into any category; using the union
    # keeps the score within [0, 1] even if categories overlap.
    defective = missing | empty | invalid

    return {
        'total_features': total,
        'missing_geometry': int(missing.sum()),
        'empty_geometry': int(empty.sum()),
        'invalid_geometry': int(invalid.sum()),
        'completeness_score': 1 - int(defective.sum()) / total,
    }

# 使用示例
if __name__ == "__main__":
    # Usage example: load the road layer and print one metric per line.
    data = gpd.read_file("roads.shp")
    completeness = check_completeness(data)
    print("数据完整性评估:")
    for metric, value in completeness.items():
        print(f"{metric}: {value}")

1.2 位置精度评估
from scipy.spatial import distance

def positional_accuracy(gdf, reference_gdf, buffer_distance=10):
    """Estimate positional accuracy against a reference layer.

    For every geometry in ``gdf`` the distance to the nearest feature of
    ``reference_gdf`` is computed; a feature counts as matched when that
    distance is at most ``buffer_distance``.

    Args:
        gdf: Layer under test; its geometry column is iterated.
        reference_gdf: Reference layer; must provide ``distance(geom)``.
            NOTE(review): both layers are assumed to share one CRS, and
            ``buffer_distance`` is assumed to be in that CRS's units - confirm
            with callers.
        buffer_distance: Maximum nearest-feature distance for a match.

    Returns:
        dict with ``matched_features``, ``match_rate``, ``mean_distance`` and
        ``std_distance``. ``match_rate`` is ``None`` when ``gdf`` has no
        features; the distance statistics are ``None`` when nothing matched.
    """
    total = len(gdf)
    matched_distances = []

    for geom in gdf.geometry:
        if geom is None:
            # A missing geometry cannot be matched; it still counts in `total`.
            continue
        # Take the minimum directly rather than idxmin() plus a label lookup,
        # which returns a Series when the reference index has duplicate labels.
        # An empty reference layer yields NaN, which never compares as matched.
        nearest = reference_gdf.distance(geom).min()
        if nearest <= buffer_distance:
            matched_distances.append(nearest)

    matched = len(matched_distances)
    return {
        'matched_features': matched,
        'match_rate': matched / total if total else None,
        'mean_distance': np.mean(matched_distances) if matched_distances else None,
        'std_distance': np.std(matched_distances) if matched_distances else None,
    }

# 使用示例
if __name__ == "__main__":
    # Usage example: compare the layer under test with the reference layer.
    test_data = gpd.read_file("test_roads.shp")
    ref_data = gpd.read_file("ref_roads.shp")
    accuracy = positional_accuracy(test_data, ref_data)
    print("\n位置精度评估:")
    for metric, value in accuracy.items():
        print(f"{metric}: {value}")

二、常见质量问题修复

2.1 几何有效性修复
from shapely.validation import make_valid

def fix_invalid_geometries(gdf):
    """Return a copy of ``gdf`` with invalid geometries repaired.

    Each geometry that is present and not valid is passed through
    ``make_valid``; valid and missing (``None``) geometries are kept as they
    are. The input is not modified.

    Args:
        gdf: GeoDataFrame to repair.

    Returns:
        A copy of ``gdf`` whose active geometry column holds the repaired
        geometries.
    """
    fixed_gdf = gdf.copy()
    geometry = fixed_gdf.geometry
    # Rebuild the whole column positionally: this avoids label-based writes
    # (which misbehave with duplicate index labels) and uses the actual name
    # of the geometry column instead of assuming it is called 'geometry'.
    fixed_gdf[geometry.name] = [
        make_valid(geom) if geom is not None and not geom.is_valid else geom
        for geom in geometry
    ]
    return fixed_gdf

# 使用示例
if __name__ == "__main__":
    # Usage example: report the invalid-geometry count before and after repair.
    invalid_data = gpd.read_file("invalid_polygons.shp")
    invalid_before = sum(~invalid_data.geometry.is_valid)
    print(f"修复前无效几何数量: {invalid_before}")

    fixed_data = fix_invalid_geometries(invalid_data)
    invalid_after = sum(~fixed_data.geometry.is_valid)
    print(f"修复后无效几何数量: {invalid_after}")

2.2 拓扑错误修正
def fix_topological_errors(gdf, buffer_size=0.0001):
    """Return a copy of ``gdf`` with common polygon topology errors repaired.

    Polygon and MultiPolygon geometries are rebuilt with ``buffer(0)`` and
    then stripped of interior rings (holes) whose area is below
    ``buffer_size``. All other geometry types and missing geometries are
    returned unchanged, because a zero-width buffer would turn lines and
    points into empty polygons.

    Args:
        gdf: GeoDataFrame to clean; it is not modified.
        buffer_size: Minimum hole area to keep. Despite the name it is
            compared against an area, not used as a buffer distance.

    Returns:
        A cleaned copy of ``gdf``.
    """
    # Local import keeps the function self-contained.
    from shapely.geometry import MultiPolygon, Polygon

    def _drop_small_holes(polygon):
        """Return ``polygon`` without holes smaller than ``buffer_size``."""
        rings = list(polygon.interiors)
        kept = [ring for ring in rings if Polygon(ring).area >= buffer_size]
        if len(kept) == len(rings):
            return polygon
        return Polygon(polygon.exterior, kept)

    def _clean(geom):
        """Repair one geometry; non-polygonal input passes through."""
        if geom is None or geom.geom_type not in ('Polygon', 'MultiPolygon'):
            return geom
        repaired = geom.buffer(0)
        # buffer(0) may change the type (e.g. split a polygon into several
        # parts), so dispatch on the repaired geometry rather than the input.
        if repaired.geom_type == 'Polygon':
            return _drop_small_holes(repaired)
        if repaired.geom_type == 'MultiPolygon':
            return MultiPolygon([_drop_small_holes(part) for part in repaired.geoms])
        return repaired

    cleaned_gdf = gdf.copy()
    geometry = cleaned_gdf.geometry
    # Assign positionally so a non-default index cannot misdirect the writes.
    cleaned_gdf[geometry.name] = [_clean(geom) for geom in geometry]
    return cleaned_gdf

# 使用示例
if __name__ == "__main__":
    # Usage example: read the raw polygons, clean them, write the result.
    dirty_data = gpd.read_file("dirty_polygons.shp")
    cleaned_data = fix_topological_errors(dirty_data)
    cleaned_data.to_file("cleaned_polygons.shp")

三、属性数据质量控制

3.1 属性一致性检查
import pandas as pd

def check_attribute_consistency(gdf, rules):
    """Return the index labels of rows that violate any attribute rule.

    Args:
        gdf: DataFrame / GeoDataFrame to validate.
        rules: Mapping of column name to a rule dict. Supported rule types:
            ``{'type': 'range', 'min': lo, 'max': hi}`` - value must lie in
            the inclusive range; ``{'type': 'categorical', 'values': [...]}``
            - value must be one of ``values``;
            ``{'type': 'not_null'}`` - value must not be null.
            Rules with any other ``type`` are ignored.

    Returns:
        List of index labels, each listed once, in the row order of ``gdf``.

    Raises:
        KeyError: If a rule names a column that is missing from ``gdf`` or
            lacks a key required by its type.
    """
    # One boolean flag per row; OR-ing the rules together de-duplicates rows
    # that break several rules and keeps the result in row order.
    flagged = np.zeros(len(gdf), dtype=bool)

    for col, rule in rules.items():
        kind = rule['type']
        if kind == 'range':
            bad = ~gdf[col].between(rule['min'], rule['max'])
        elif kind == 'categorical':
            bad = ~gdf[col].isin(rule['values'])
        elif kind == 'not_null':
            bad = gdf[col].isnull()
        else:
            continue
        # Nullable dtypes can yield <NA> in the mask; boolean indexing treats
        # those as "not selected", so map them to False here.
        flagged |= bad.to_numpy(dtype=bool, na_value=False)

    # dict.fromkeys drops repeated labels (possible with a non-unique index)
    # while preserving first-seen order.
    return list(dict.fromkeys(gdf.index[flagged].tolist()))

# 使用示例
if __name__ == "__main__":
    # Usage example: validate building attributes against three rules.
    data = gpd.read_file("buildings.shp")

    allowed_types = ['residential', 'commercial', 'industrial']
    validation_rules = {
        'height': {'type': 'range', 'min': 1, 'max': 500},
        'type': {'type': 'categorical', 'values': allowed_types},
        'address': {'type': 'not_null'},
    }

    invalid_ids = check_attribute_consistency(data, validation_rules)
    violation_count = len(invalid_ids)
    print(f"发现{violation_count}条违反规则的记录")

3.2 属性数据修复
def clean_attributes(gdf, rules):
    """Return a copy of ``gdf`` with attribute columns repaired per ``rules``.

    ``rules`` maps a column name to a rule dict whose ``type`` selects the
    repair:

    * ``'range'`` - clip values to ``[min, max]``.
    * ``'categorical'`` - replace values not listed in ``values`` with
      ``default``.
    * ``'fill_null'`` - replace nulls with ``value``.

    Rules of any other type leave their column untouched. The input frame is
    not modified.
    """

    def _clamp(values, spec):
        # Out-of-range values are pulled to the nearest bound.
        return values.clip(spec['min'], spec['max'])

    def _recode(values, spec):
        # Anything outside the allowed categories becomes the default.
        return values.where(values.isin(spec['values']), spec['default'])

    def _fill(values, spec):
        return values.fillna(spec['value'])

    repairs = {
        'range': _clamp,
        'categorical': _recode,
        'fill_null': _fill,
    }

    result = gdf.copy()
    for column, spec in rules.items():
        repair = repairs.get(spec['type'])
        if repair is None:
            continue
        result[column] = repair(result[column], spec)
    return result

# 使用示例
if __name__ == "__main__":
    # Usage example: repair parcel attributes and save the cleaned layer.
    data = gpd.read_file("parcels.shp")

    land_use_rule = {
        'type': 'categorical',
        'values': ['residential', 'commercial', 'park'],
        'default': 'residential',
    }
    cleaning_rules = {
        'area': {'type': 'range', 'min': 10, 'max': 10000},
        'land_use': land_use_rule,
        'owner': {'type': 'fill_null', 'value': 'UNKNOWN'},
    }

    cleaned_data = clean_attributes(data, cleaning_rules)
    cleaned_data.to_file("cleaned_parcels.shp")

四、自动化质量控制流程

4.1 综合质量评估报告
def generate_quality_report(gdf, reference_gdf=None):
    """Build a combined quality report for ``gdf``.

    The report merges, in this order:

    * the result of ``check_completeness(gdf)``;
    * the result of ``positional_accuracy(gdf, reference_gdf)``, only when
      ``reference_gdf`` is given;
    * ``valid_geometry_rate`` - mean of the geometry validity flags;
    * ``attribute_completeness`` - for every non-geometry column, the share
      of non-null values.

    Args:
        gdf: GeoDataFrame to assess.
        reference_gdf: Optional reference layer for positional accuracy.

    Returns:
        dict holding the merged metrics described above.
    """
    report = {}

    report.update(check_completeness(gdf))

    if reference_gdf is not None:
        report.update(positional_accuracy(gdf, reference_gdf))

    geometry = gdf.geometry
    report['valid_geometry_rate'] = geometry.is_valid.mean()

    # Skip the active geometry column by its actual name; it is not
    # necessarily called 'geometry'.
    geometry_column = geometry.name
    report['attribute_completeness'] = {
        col: 1 - gdf[col].isnull().mean()
        for col in gdf.columns
        if col != geometry_column
    }

    return report

# 使用示例
if __name__ == "__main__":
    # Usage example: assess a network layer against its reference layer.
    data = gpd.read_file("network.shp")
    ref_data = gpd.read_file("ref_network.shp")

    report = generate_quality_report(data, ref_data)
    print("\n综合质量评估报告:")
    for metric, value in report.items():
        print(f"{metric}: {value}")

4.2 自动化清洗流程
def automated_cleaning_pipeline(gdf, reference_gdf=None, cleaning_rules=None):
    """Run geometry repair, attribute cleaning and reporting in one call.

    Steps, in order: ``fix_invalid_geometries``, ``fix_topological_errors``,
    then ``clean_attributes`` (only when ``cleaning_rules`` is truthy), and
    finally ``generate_quality_report`` on the cleaned data.

    Returns:
        Tuple ``(cleaned, report)``.
    """
    # Geometry first: validity repair feeds the topology repair.
    result = fix_topological_errors(fix_invalid_geometries(gdf))

    # Attribute cleaning is optional.
    if cleaning_rules:
        result = clean_attributes(result, cleaning_rules)

    # The report describes the cleaned data, not the raw input.
    return result, generate_quality_report(result, reference_gdf)

# 使用示例
if __name__ == "__main__":
    # Usage example: clean a raw layer end to end and print the report.
    raw_data = gpd.read_file("raw_data.shp")
    ref_data = gpd.read_file("reference_data.shp")

    category_rule = {'type': 'categorical', 'values': ['A', 'B', 'C'], 'default': 'A'}
    rules = {
        'value': {'type': 'range', 'min': 0, 'max': 100},
        'category': category_rule,
    }

    cleaned_data, report = automated_cleaning_pipeline(
        raw_data, reference_gdf=ref_data, cleaning_rules=rules
    )

    cleaned_data.to_file("cleaned_data.shp")
    print("\n清洗完成,质量报告:")
    print(report)

五、总结

本文介绍了地理空间数据质量管理的完整流程：

  1. 质量评估 - 完整性、位置精度、属性一致性等指标
  2. 几何修复 - 有效性检查、拓扑错误修正
  3. 属性清洗 - 范围检查、类别验证、空值处理
  4. 自动化流程 - 综合质量报告和一站式清洗

实际应用中应根据数据类型和使用场景调整质量标准和清洗策略。定期进行数据质量评估可确保GIS分析的可靠性。