Python 使用xpath匹配html内容并生成CSV文件

本文介绍了一种使用Python批量读取HTML文件,并通过lxml库的XPath解析技术来提取标题、内容、来源、时间和作者等关键信息的方法。该方法利用CSV模块将提取到的信息存储为CSV文件,便于进一步的数据处理和分析。
# -*- coding: utf-8 -*-
import os
import re
import csv
from lxml import html
# Build the CSV file
def get_list_dir():
    """Parse every file in the article directory and export the results to CSV.

    Each directory entry is handed to ``get_article_content``; every truthy
    result is written as one row below the header row. The output file is
    created inside the same directory that is being scanned, so it is
    explicitly skipped while iterating.
    """
    dir_aim = "D:/Python/PythonProjects/TestDemo/article"
    csv_name = '31530942.csv'
    csv_path = dir_aim + '/' + csv_name
    headers = ('标题', '内容', '来源', '时间', '作者')

    # newline='' lets the csv module control line endings itself; without it
    # every record is followed by an empty line on Windows.
    # 'utf-8-sig' writes a BOM so spreadsheet tools detect the encoding.
    with open(csv_path, 'w', encoding='utf-8-sig', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        for filename in os.listdir(dir_aim):
            # The CSV being written sits in the scanned directory; parsing it
            # as an article would append a row of empty strings.
            if os.path.normcase(filename) == os.path.normcase(csv_name):
                continue
            article = get_article_content(filename)
            if article:
                writer.writerow(article)
# Read an HTML file, parse it with XPath and return the extracted fields
def _extract_field(text, label, to_line_end=False):
    """Return the value that follows ``label`` and a colon in ``text``.

    Both the ASCII colon and the full-width colon are accepted, and
    spaces/tabs directly after the colon are skipped. By default the value is
    the run of non-whitespace characters after the label; with
    ``to_line_end=True`` it extends to the end of that line. An empty string
    is returned when the label is not present.
    """
    value_pattern = r"[^\r\n]*" if to_line_end else r"\S*"
    # [^\S\r\n]* = horizontal whitespace only, so an empty value can never
    # swallow the following line.
    pattern = re.escape(label) + r"[::][^\S\r\n]*(" + value_pattern + r")"
    match = re.search(pattern, text)
    return match.group(1).strip() if match else ""


def get_article_content(fileName, base_dir='D:/Python/PythonProjects/TestDemo/article/'):
    """Extract title, body, source, time and editor from one article file.

    Args:
        fileName: Name of the file inside ``base_dir``.
        base_dir: Directory that holds the article files. Defaults to the
            location originally hard-coded in this script.

    Returns:
        A tuple ``(title, content, source, time, editor)`` of strings; a field
        that cannot be found is an empty string. ``None`` is returned when the
        file cannot be read or parsed (the error is printed instead of
        raised, so one bad file does not stop a batch run).
    """
    try:
        with open(os.path.join(base_dir, fileName), 'r', encoding='utf-8') as f:
            raw_html = f.read()

        tree = html.etree.HTML(raw_html)
        if tree is None:
            # Guard against the parser producing no document tree at all.
            raise ValueError("no parsable HTML content")

        # The XPath expressions are absolute and tied to one page layout
        # (article container is the fifth <div> under <body>).
        title = "".join(tree.xpath("/html/body/div[5]/h1/text()"))
        body = "".join(tree.xpath("/html/body/div[5]/div[2]/p/text()")).replace("\n", "")

        # Source and time share one info block, so evaluate its XPath once.
        info_text = "".join(tree.xpath("/html/body/div[5]/div[1]/text()"))
        source = _extract_field(info_text, "来源")
        publish_time = _extract_field(info_text, "时间", to_line_end=True)

        editor_text = "".join(tree.xpath("/html/body/div[5]/div[3]/text()"))
        editor = _extract_field(editor_text, "责编")

        return (title, body, source, publish_time, editor)
    except Exception as e:
        # Deliberate per-file catch-all: report the failing file and let the
        # caller continue with the next one.
        print(f"{fileName}: {e}")
        return None
# Run the export. There is no __main__ guard, so this call also fires when the
# module is imported, not only when it is executed as a script.
get_list_dir()

说明:open() 的 'a+' 模式表示以追加方式(可读写)打开文件,写入的内容会接在文件末尾,不会覆盖已有数据。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值