Write a crawler that hops randomly from one external link to the next. If a page has no external links, follow a random internal link instead and continue collecting external links from there. The code is as follows:
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
# random.seed() expects an int/float/str/bytes value in Python 3,
# so seed with the current timestamp rather than a datetime object.
random.seed(datetime.datetime.now().timestamp())

def getInternalLinks(bsObj, includeurl):
    """Collect every link on the page that points back into the same site."""
    includeurl = urlparse(includeurl).scheme + "://" + urlparse(includeurl).netloc
    internalLinks = []
    # Internal links either start with "/" or contain the site's own domain.
    for link in bsObj.find_all("a", href=re.compile("^(/|.*" + includeurl + ")")):
        if link.attrs["href"] is not None:
            if link.attrs["href"] not in internalLinks:
                if link.attrs["href"].startswith("/"):
                    internalLinks.append(includeurl + link.attrs["href"])
                else:
                    internalLinks.append(link.attrs["href"])
    return internalLinks

def getExternalLinks(bsObj, excludeurl):
    """Collect every link on the page that points to a different domain."""
    externalLinks = []
    # External links start with "http" or "www" and do not contain the current
    # domain; re.escape keeps the dots in the domain from acting as wildcards.
    for link in bsObj.find_all("a", href=re.compile("^(http|www)((?!" + re.escape(excludeurl) + ").)*$")):
        if link.attrs["href"] is not None:
            if link.attrs["href"] not in externalLinks:
                externalLinks.append(link.attrs["href"])
    return externalLinks

def splitAddress(address):
    # Helper that splits an address into its path segments (not used below).
    addressParts = address.replace("http://", "").split("/")
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html, "html.parser")
    externalLinks = getExternalLinks(bsObj, urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        # No external links here: follow a random internal link and try again.
        print("No external links, looking around the site for one")
        domain = urlparse(startingPage).scheme + "://" + urlparse(startingPage).netloc
        internalLinks = getInternalLinks(bsObj, domain)
        if len(internalLinks) == 0:
            return None
        return getRandomExternalLink(random.choice(internalLinks))
    return random.choice(externalLinks)

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    if externalLink is None:
        print("Over")
        return None
    print("Random external link is: " + externalLink)
    followExternalOnly(externalLink)

followExternalOnly("http://www.baidu.com")