Python--手动解析--80%实现对手机通讯录的名字与号码的 xlsx 整理

import pandas as pd
import quopri
import re
from chardet import detect
from opencc import OpenCC

# Initialise the OpenCC converter: the 't2s' profile converts Traditional Chinese to Simplified Chinese
cc = OpenCC('t2s')

def to_simplified(text):
    """Convert Traditional Chinese characters in ``text`` to Simplified Chinese.

    Falsy input (``None`` or an empty string) yields an empty string;
    anything else is passed through the module-level OpenCC converter.
    """
    return cc.convert(text) if text else ''

def decode_quoted_printable(text, charset='utf-8'):
    """Decode a Quoted-Printable value to text, trying several charsets.

    Args:
        text: The Quoted-Printable payload, as ``bytes`` or an ASCII-only
            ``str``.
        charset: Preferred charset label (normally the vCard ``CHARSET=``
            parameter). An unknown label is skipped instead of raising.

    Returns:
        The decoded string. Charsets are tried strictly in the order
        ``charset``, UTF-8, GBK; if none of them decodes the bytes cleanly
        the result is a GBK decode with undecodable bytes replaced.
    """
    # Undo the Quoted-Printable layer once; only the charset step is retried.
    raw = quopri.decodestring(text)

    for candidate in (charset or 'utf-8', 'utf-8', 'gbk'):
        try:
            return raw.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # UnicodeDecodeError: bytes are not valid in this charset.
            # LookupError: the charset label is not known to Python.
            continue

    # Last resort: never fail, mark undecodable bytes instead.
    return raw.decode('gbk', errors='replace')

# Matches the CHARSET parameter of a vCard property header, e.g. "CHARSET=UTF-8".
_CHARSET_RE = re.compile(r'CHARSET=([^;:,]+)', re.IGNORECASE)


def _is_quoted_printable(header):
    """Return True if the property header (bytes before ':') declares Quoted-Printable."""
    return b'QUOTED-PRINTABLE' in header.upper()


def _iter_logical_lines(raw_lines):
    """Yield complete vCard content lines (bytes), joining continuation lines."""
    pending = None
    for raw in raw_lines:
        raw = raw.rstrip(b'\r\n')
        if pending is not None:
            header, sep, _ = pending.partition(b':')
            if sep and pending.endswith(b'=') and _is_quoted_printable(header):
                # Quoted-Printable soft line break: the trailing '=' means the
                # value continues on the next physical line.
                pending = pending[:-1] + raw.strip()
                continue
            if raw[:1] in (b' ', b'\t'):
                # Folded line: a leading space/tab marks a continuation.
                pending += raw[1:].rstrip()
                continue
            if pending:
                yield pending
        pending = raw.strip()
    if pending:
        yield pending


def _decode_vcard_text(raw, charset=None):
    """Decode a raw property value, preferring the declared charset, then UTF-8."""
    for candidate in (charset, 'utf-8'):
        if not candidate:
            continue
        try:
            return raw.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            continue
    # Only guess the encoding when the strict attempts failed: detection on a
    # single short line is unreliable and would otherwise garble valid UTF-8.
    guessed = detect(raw)['encoding'] or 'gbk'
    try:
        return raw.decode(guessed, errors='replace')
    except LookupError:
        return raw.decode('latin1', errors='replace')


def parse_vcard_file(file_path):
    """Extract (name, phone) pairs from a vCard (.vcf) file.

    The file is read as bytes. Continuation lines are joined first, then
    each ``FN`` property sets the current contact name and each ``TEL``
    property emits one record under that name. The name is kept for every
    number of the same card and cleared at ``BEGIN``/``END`` so it cannot
    leak into the next card.

    Args:
        file_path: Path of the .vcf file to read.

    Returns:
        A list of dicts with the keys ``'Name'`` (converted to Simplified
        Chinese, or "未知联系人" when no name is known) and ``'Phone'``,
        one entry per non-empty telephone number, in file order.
    """
    contacts = []
    current_name = None

    with open(file_path, 'rb') as f:
        for line in _iter_logical_lines(f):
            header, sep, value = line.partition(b':')
            if not sep:
                # Not a "name:value" content line; nothing to extract.
                continue

            header_text = header.decode('ascii', errors='replace')
            # Drop parameters and an optional group prefix: "item1.TEL;CELL" -> "TEL".
            prop = header_text.split(';', 1)[0].rpartition('.')[2].strip().upper()

            if prop in ('BEGIN', 'END'):
                # Card boundary: forget the previous contact's name.
                current_name = None

            elif prop == 'FN':
                charset_match = _CHARSET_RE.search(header_text)
                charset = charset_match.group(1).strip() if charset_match else None

                if _is_quoted_printable(header):
                    # The payload is passed through unchanged: in
                    # Quoted-Printable '_' is a literal underscore.
                    payload = value.strip()
                    try:
                        name = decode_quoted_printable(payload, charset or 'utf-8')
                    except LookupError:
                        # Unknown charset label in the file; fall back to UTF-8.
                        name = decode_quoted_printable(payload, 'utf-8')
                else:
                    name = _decode_vcard_text(value, charset)
                current_name = name.strip()

            elif prop in ('TEL', 'TELX'):
                phone = _decode_vcard_text(value).strip()
                if phone:
                    simplified_name = to_simplified(current_name) if current_name else "未知联系人"
                    contacts.append({
                        'Name': simplified_name,
                        'Phone': phone
                    })
                    print(f"已添加联系人：{simplified_name} - {phone}")

    return contacts

# Input vCard file and output workbook locations
vcf_path = r"C:\Users\Lenovo\Desktop\00004.vcf"
output_excel = r"C:\Users\Lenovo\Desktop\谢忠发的电话目录.xlsx"

# Parse the vCard file into a list of {'Name', 'Phone'} records
contacts = parse_vcard_file(vcf_path)

# Write to Excel; naming the columns explicitly keeps the header row
# in the sheet even when no contacts were found
df = pd.DataFrame(contacts, columns=['Name', 'Phone'])
df.to_excel(output_excel, index=False, sheet_name='Contacts')

print(f"成功导出至 {output_excel}")

文字乱码的部分还没有调整好；如果有小伙伴有合适的解决方法，欢迎在评论区交流。