import re
import requests
def getlink(url, timeout=10):
    """Download a page and return the unique http(s) links found in its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the request can block indefinitely.

    Returns:
        A list of unique 2-tuples in first-seen order. The pattern has two
        capture groups, so ``re.findall`` yields one tuple per match: item 0
        is the whole matched URL, item 1 is the last character consumed by
        the trailing repeated group (an empty string if it matched nothing).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Present a browser User-Agent with the request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    data = response.text

    # Raw string so the regex escapes (\s, \., \w) reach the regex engine
    # untouched instead of being treated as invalid string escapes.
    pat = r'(https?://[^\s)";]+\.(\w|/)*)'
    matches = re.findall(pat, data)

    # Drop duplicates while keeping first-seen order, so the result is
    # deterministic across runs (a plain set() would scramble the order).
    return list(dict.fromkeys(matches))
# Page whose links are to be listed. The literal must stay on one line:
# a single-quoted string cannot span a line break.
url = 'https://www.youkuaiyun.com/'
# Collect the links contained in the page at that address.
linklist = getlink(url)
for link in linklist:
    # Each entry is a tuple of regex groups; index 0 is the full URL.
    print(link[0])
# Crawler: hyperlinks on the CSDN home page
# (Page footer: latest recommended article published 2025-04-23 11:34:02)