本文源自人工智能课程一次作业中的一道题:用广度优先搜索(BFS)寻找从起始 URL 到目标 URL 的最短链接路径。
本人是 Python 新手,还需要好好努力。话不多说,直接上代码。
#encoding:UTF-8
import time
from lxml import etree
import urllib2
import Queue
import requests
requests.packages.urllib3.disable_warnings()
def findUrl(url):
    """Download ``url`` and collect the ``href`` of every ``<a>`` tag on it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of non-empty href strings in document order. Anchors that
        carry no ``href`` attribute are skipped. An empty list is returned
        when the page cannot be fetched or parsed, so the result can always
        be iterated.
    """
    try:
        # Deliberately short timeout so one slow host cannot stall the crawl.
        res = requests.get(url, timeout=0.6)
        html = etree.HTML(res.text)
    except (requests.RequestException, etree.LxmlError, ValueError):
        # Best-effort crawl: an unreachable, malformed or unparsable page is
        # treated as a page without links instead of aborting the search.
        return []

    if html is None:
        # NOTE(review): etree.HTML appears to yield None for some empty
        # documents (depends on the lxml version) -- nothing to scan then.
        return []

    links = []
    for anchor in html.findall('.//a'):
        href = anchor.get('href')
        if href:  # skip <a> tags without a usable href (None or '')
            links.append(href)
    return links
def printPath(parents, startUrl, targetUrl):
    """Print the chain of URLs leading from ``startUrl`` to ``targetUrl``.

    The chain is rebuilt by following ``parents`` backwards from
    ``targetUrl`` and is printed in forward order, one hop per line.

    Args:
        parents: Mapping of each URL to the URL it was discovered from;
            the root of the search maps to ``None``.
        startUrl: URL the search started from; the backward walk stops
            as soon as it is reached.
        targetUrl: URL whose path should be printed.

    Returns:
        None. Output goes to stdout. If ``targetUrl`` has no entry in
        ``parents`` a short notice is printed instead of a path. If the
        chain ends before ``startUrl`` is reached (empty/missing parent or
        a cycle), the part collected so far is printed.
    """
    print("\nThe path from %s to %s: " % (startUrl, targetUrl))
    if targetUrl not in parents:
        print("(no recorded path)")
        return

    path = [targetUrl]
    seen = set(path)  # guards against cycles in the parent mapping
    node = targetUrl
    while node != startUrl:
        parent = parents.get(node)
        if not parent or parent in seen:
            break
        path.append(parent)
        seen.add(parent)
        node = parent

    path.reverse()  # collected target -> start, shown start -> target
    print("\n-> ".join(path))
def search(startUrl, targetUrl):
    """Breadth-first search for ``targetUrl`` over links reachable from ``startUrl``.

    Pages are expanded level by level via ``findUrl``. Each URL's parent is
    recorded only the first time the URL is discovered, so the recorded
    chain is a shortest link path and can never loop back on itself.

    Args:
        startUrl: URL at which the crawl begins.
        targetUrl: URL to look for; compared to discovered hrefs with ``==``.

    Returns:
        None. Progress is printed for every page expanded; on success a
        confirmation and the path (via ``printPath``) are printed. Nothing
        further is printed if the queue empties without a match.
    """
    if startUrl == targetUrl:
        # Trivial case: already at the target, no crawling required.
        print('find %s successfully\n' % targetUrl)
        printPath({startUrl: None}, startUrl, targetUrl)
        return

    queue = Queue.Queue()         # URLs discovered but not yet expanded
    visited = set([startUrl])     # URLs already discovered
    parents = {startUrl: None}    # URL -> URL it was first discovered from
    queue.put(startUrl)

    while not queue.empty():
        curentUrl = queue.get()
        print('search in %s ...' % curentUrl)
        try:
            urlLink = findUrl(curentUrl)
        except Exception as e:
            # Keep the crawl best-effort, but say which page was dropped.
            print('skip %s: %s' % (curentUrl, e))
            continue

        for url in urlLink or []:
            # Ignore empty hrefs and URLs that already have a parent:
            # overwriting a parent would break the shortest-path tree and
            # could create a cycle (e.g. a page linking back to startUrl).
            if not url or url in visited:
                continue
            parents[url] = curentUrl
            if url == targetUrl:
                print('find %s successfully\n' % targetUrl)
                printPath(parents, startUrl, targetUrl)
                return
            visited.add(url)
            queue.put(url)
if __name__ == '__main__':
    # Script entry point: run one search between two fixed sites and
    # report how long it took in wall-clock seconds.
    startTime = time.time()
    search('http://helpdesk.sysu.edu.cn/', 'http://tv.sysu.edu.cn/')
    elapsed = time.time() - startTime
    print("\nCost time: %f s" % elapsed)
查找过程如下:
查询结果如下:
程序提示成功找到目标 URL,打印最短路径,并显示查询耗时。