# Crawl every link reachable from a web page (adapted from an online tutorial).
import http.client
import re
import urllib
import urllib.request
from collections import deque
# Breadth-first crawl: start from one page, follow every link containing
# 'http' that appears in an href="..." attribute of each HTML page, and write
# each newly discovered link to a text file, one per line.

# Upper bound on a single fetch so one unresponsive server cannot stall the crawl.
FETCH_TIMEOUT_SECONDS = 10
# Compiled once and reused for every page instead of once per iteration.
LINK_RE = re.compile(r'href=\"(.+?)\"')

que = deque()  # URLs discovered but not fetched yet (FIFO => breadth-first)
vis = set()    # URLs already queued; guarantees each one is fetched at most once
url = 'http://news.dbanotes.net/'
que.append(url)
vis.add(url)

# The context manager closes the file even if the crawl is interrupted.
# UTF-8 is explicit so a link with non-ASCII characters cannot fail to encode
# under a narrower platform default.
with open('G:/1.txt', 'w', encoding='utf-8') as f:
    while que:
        url = que.popleft()
        try:
            with urllib.request.urlopen(
                url, timeout=FETCH_TIMEOUT_SECONDS
            ) as response:
                # A missing Content-Type header yields None; treat it as non-HTML.
                if 'html' not in (response.getheader('Content-Type') or ''):
                    continue
                data = response.read().decode('utf-8')
        except (OSError, ValueError, http.client.HTTPException):
            # Best-effort crawl: skip pages that cannot be fetched or decoded.
            # OSError covers URLError/HTTPError/timeouts; ValueError covers
            # malformed URLs and UnicodeDecodeError.
            continue
        for link in LINK_RE.findall(data):
            if link not in vis and 'http' in link:
                # Mark at enqueue time so a link seen on several pages is
                # queued, written and fetched only once.
                vis.add(link)
                que.append(link)
                f.write(link + '\n')
76万+

被折叠的 条评论
为什么被折叠?



