Requirement
Fetch all the links on the Sohu homepage and pick out the pages whose content is related to basketball.
Algorithm
Step 1: fetch the full content of the Sohu homepage.
Step 2: filter out all valid links from that content.
Step 3: fetch the page content behind each valid link.
Step 4: check whether that content contains "篮球" (basketball).
Step 5: save the pages whose content contains "篮球".
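Before the two full implementations in the Code section, here is a minimal sketch of the five steps as one pipeline. It is only an outline under simplifying assumptions: the helper names, the "match_%d.html" file names, and the crude startswith("http") validity check are illustrative and not part of the original scripts.
import re
import requests

def fetch(url):
    #Steps 1 and 3: download a page and return its text
    return requests.get(url, timeout=10).text

def extract_links(html):
    #Step 2: pull every href value out of the raw HTML
    return re.findall(r'href="(.*?)"', html)

home = fetch("http://www.sohu.com")
for i, link in enumerate(extract_links(home)):
    if not link.startswith("http"):   #crude validity filter, just for the sketch
        continue
    try:
        page = fetch(link)
        if "篮球" in page:            #Step 4: keyword check
            with open("match_%d.html" % i, "w", encoding="utf-8") as fp:
                fp.write(page)        #Step 5: save the matching page
    except Exception as e:
        print(e)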
Code
#Method 1
import requests
import re
#Fetch the homepage source
r = requests.get("http://www.sohu.com")
content = r.text
with open("d:\\2019\\sohu\\first_page.html","w",encoding="utf-8") as fp:
    fp.write(content)
#Collect every url on the page
all_links = re.findall(r'href="(.*?)"',content)
print("Found %s links in total" %len(all_links))
with open("d:\\2019\\1.txt","a+",encoding="utf-8") as fp:
    for link in all_links:
        fp.write(link+"\n")
#Filter the urls
valid_links = []   #valid links
invalid_links = [] #invalid links
for link in all_links:
    link = link.strip()
    #Static resources (images, css, js, ...) are not pages worth scanning
    if re.search(r'\.(jpg|jpeg|png|css|js|ico|tif|gif)$',link):
        #print(1,link)
        invalid_links.append(link)
        continue
    #href="#" only reloads the current page
    #href="/" points to the site root
    elif link in ["","#","/"]:
        #print(2,link)
        invalid_links.append(link)
        continue
    #Protocol-relative links start with "//"; prepend a scheme
    elif link.startswith("//"):
        #print(3,link)
        valid_links.append("http:"+link)
        continue
    #javascript:void(0) is a dead link and mailto: is an e-mail address; neither carries page content
    elif "javascript" in link or "mailto:" in link:
        #print(4,link)
        invalid_links.append(link)
        continue
    #Site-relative links such as "/a/123" need the homepage's base url prepended
    elif re.match(r'/\w+',link):
        #print(8,link)
        base = re.match(r"https?://.*?/", r.url.strip())
        if base:
            print(5,link)
            valid_links.append(base.group().rstrip("/") + link) #base url ends with "/"
        else:
            print(6,link)
            valid_links.append(re.match(r"https?://.*", r.url.strip()).group() + link) #base url without a trailing "/"
        continue
    else:
        #print(7,link)
        valid_links.append(link)
print("%s of them are valid links" %len(valid_links))
#Go through the valid links and save the pages that mention "篮球"
file_num = 1
for link in valid_links:
    try:
        r = requests.get(link.strip())
        content = r.text
        charset = r.encoding
        if "篮球" in content:
            print(file_num,charset)
            with open("d:\\2019\\sohu\\"+str(file_num)+charset+".html","w",encoding=charset) as fp:
                fp.write(content)
            file_num += 1
            print("***************************************")
    except Exception as e:
        print(e)
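As a side note, the branch-by-branch prefixing of "//" and "/path" links above can also be delegated to urllib.parse.urljoin from the standard library, which resolves a relative href against the page it came from. A minimal sketch, with made-up example hrefs:
from urllib.parse import urljoin

base_url = "http://www.sohu.com/"   #in the script above this would be r.url of the homepage request
for href in ["//www.sohu.com/a/1.html", "/a/2.html", "sports.html", "http://t.sohu.com"]:
    print(urljoin(base_url, href))
#urljoin keeps absolute links unchanged and resolves protocol-relative and
#site-relative forms against base_url, so no manual string surgery is needed.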
#Method 2
import requests
from bs4 import BeautifulSoup
import re
#Fetch the homepage source
r = requests.get("http://www.sohu.com")
soup = BeautifulSoup(r.text,"html.parser")
#Collect every url on the page
def has_href(tag):
    return tag.has_attr('href')
#find_all(has_href) returns the list of tags that carry an href attribute
#print(soup.find_all(has_href))
all_links = [i.get("href") for i in soup.find_all(has_href)]
print("Found %s links in total" %len(all_links))
#This is two links fewer than the regex approach of Method 1, because
#BeautifulSoup does not return tags that sit inside HTML comments
#(which is also why there is one valid link fewer); see the short demo after this listing
with open("d:\\2019\\link1.txt","a+") as fp:
    for link in all_links:
        fp.write(link+"\n")
#For comparison, the regex extraction used in Method 1:
all_links2 = re.findall(r'href="(.*?)"',r.text)
print("Found %s links with the regex" %len(all_links2))
with open("d:\\2019\\link2.txt","a+") as fp:
    for link in all_links2:
        fp.write(link+"\n")
#Filter the urls (same rules as Method 1, without the site-relative "/path" branch)
valid_links = []
invalid_links = []
for link in all_links:
    link = link.strip()
    if re.search(r'\.(jpg|jpeg|png|css|js|ico|tif|gif)$',link):
        invalid_links.append(link)
        continue
    elif link in ["","#","/"]:
        invalid_links.append(link)
        continue
    elif link.startswith("//"):
        valid_links.append("http:"+link)
        continue
    elif "javascript" in link or "mailto:" in link:
        invalid_links.append(link)
        continue
    else:
        valid_links.append(link)
print("%s of them are valid links" %len(valid_links))
#Go through the valid links and save the source of every page that mentions "篮球"
file_num = 1
for link in valid_links:
    try:
        r = requests.get(link)   #inside the try block so a failing request is skipped, not fatal
        content = r.text
        charset = r.encoding
        if "篮球" in content:
            with open("d:\\2019\\sohu\\"+str(file_num)+charset+".html","w",encoding=charset) as fp:
                fp.write(content)
            file_num += 1
    except Exception as e:
        print("Something went wrong, skipping this link:", e)