import re
# Input and output file paths.
input_file = "element_biaozhu.txt"
output_file = "job_link_fromboss.txt"

# Matches href="/job_detail/xxxx.html" and captures the site-relative path.
# NOTE: the closing quote must follow ".html" directly, so an href that
# carries a query string after ".html" is not matched.
pattern = r'href="(/job_detail/[^"]+\.html)"'

# Site root prepended to every captured path to build an absolute URL.
base_url = "https://www.zhipin.com"


def extract_job_links(content):
    """Return the absolute job-detail URLs found in *content*.

    Every ``href`` matching ``pattern`` is collected, duplicates are dropped,
    and ``base_url`` is prepended to each remaining path.

    Args:
        content: Text to scan (e.g. saved HTML markup).

    Returns:
        A list of unique absolute URLs in order of first appearance; an
        empty list when nothing matches.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is reproducible between runs (a plain set is not ordered).
    unique_paths = dict.fromkeys(re.findall(pattern, content))
    return [base_url + path for path in unique_paths]


def main():
    """Read ``input_file``, extract the links, and write them to ``output_file``.

    One URL is written per line, then a summary with the link count is
    printed.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    full_links = extract_job_links(content)

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in full_links)

    print(f"提取完成,共 {len(full_links)} 条链接,已保存到 {output_file}")


if __name__ == "__main__":
    main()
使用 Python 正则库 re 过滤网页标签元素
最新推荐文章于 2025-12-01 13:49:36 发布
3899

被折叠的 条评论
为什么被折叠?



