Python 代码如下:
import re
# Number of characters requested per read. In text mode ``file.read(n)``
# counts characters, not bytes, so this is roughly 1 MiB of ASCII text.
# The streaming filter below carries unfinished tags across reads, so the
# output does not depend on this value; it only bounds memory use.
block_size = 1024 * 1024

# Complete <style ...>...</style> block, including its contents.
_STYLE_RE = re.compile(r'<style.*?>.*?</style>', flags=re.DOTALL | re.IGNORECASE)
# Start of a <style block (used to detect one that is not closed yet).
_STYLE_OPEN_RE = re.compile(r'<style', flags=re.IGNORECASE)
# A proper prefix of "<style" sitting at the very end of the buffer.
_STYLE_PREFIX_RE = re.compile(r'<(?:s(?:t(?:y(?:l)?)?)?)?\Z', flags=re.IGNORECASE)
# Any other complete tag.
_TAG_RE = re.compile(r'<.*?>', flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)


def strip_html(text: str) -> str:
    """Return *text* with <style> blocks and all other complete tags removed."""
    return _TAG_RE.sub('', _STYLE_RE.sub('', text))


def _drop_matches(pattern, text):
    """Collect the text between matches of *pattern*.

    Returns ``(kept, pos)`` where ``kept`` is the list of pieces preceding
    each match and ``pos`` is the end offset of the last match (0 if none).
    The text after ``pos`` is left for the caller to split.
    """
    kept = []
    pos = 0
    for match in pattern.finditer(text):
        kept.append(text[pos:match.start()])
        pos = match.end()
    return kept, pos


def _split_styles(raw):
    """Remove complete <style> blocks from *raw*.

    Returns ``(clean, rest)``: ``clean`` is safe to pass on, ``rest`` starts
    at an unfinished ``<style`` block (or at a trailing fragment that could
    still grow into ``<style``) and must be retried with more input.
    """
    kept, pos = _drop_matches(_STYLE_RE, raw)
    opener = _STYLE_OPEN_RE.search(raw, pos)
    if opener is None:
        # Only the last 5 characters can be a proper prefix of "<style".
        opener = _STYLE_PREFIX_RE.search(raw, max(pos, len(raw) - 5))
    hold = len(raw) if opener is None else opener.start()
    kept.append(raw[pos:hold])
    return ''.join(kept), raw[hold:]


def _split_tags(text):
    """Remove complete tags from *text*.

    Returns ``(clean, rest)`` where ``rest`` starts at the first ``<`` that
    has no closing ``>`` yet (empty if every tag is complete).
    """
    kept, pos = _drop_matches(_TAG_RE, text)
    hold = text.find('<', pos)
    if hold == -1:
        hold = len(text)
    kept.append(text[pos:hold])
    return ''.join(kept), text[hold:]


def strip_html_chunks(chunks):
    """Yield tag-free text for an iterable of consecutive string chunks.

    The concatenated output equals ``strip_html`` applied to the
    concatenated input, regardless of where the chunk boundaries fall.
    This is done in two stages that mirror the two substitutions: style
    blocks are removed from the raw stream first, then tags are removed
    from the resulting stream. Each stage keeps its own carry-over.

    Note: a ``<style`` or ``<`` that is never closed is buffered until the
    end of the input, so memory use grows for such malformed input.
    """
    raw_rest = ''   # raw input ending inside an unfinished <style> block
    text_rest = ''  # style-free text ending inside an unfinished tag
    for chunk in chunks:
        styled_free, raw_rest = _split_styles(raw_rest + chunk)
        clean, text_rest = _split_tags(text_rest + styled_free)
        if clean:
            yield clean
    # End of input: whatever is still held can no longer be completed by
    # later data. raw_rest contains no complete style block by construction,
    # so only the tag pass is needed on the remainder.
    tail = _TAG_RE.sub('', text_rest + raw_rest)
    if tail:
        yield tail


def strip_html_file(src_path: str, dst_path: str, chunk_size: int = block_size) -> None:
    """Strip tags from *src_path* and append the result to *dst_path*.

    The input is read *chunk_size* characters at a time. The output file is
    opened once, in append mode, so existing content in it is preserved.
    """
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'a', encoding='utf-8') as dst:
        # iter(callable, sentinel) stops when read() returns '' at EOF.
        for piece in strip_html_chunks(iter(lambda: src.read(chunk_size), '')):
            dst.write(piece)


if __name__ == '__main__':
    strip_html_file('xxx_1.txt', 'xxx_out_1.txt', block_size)