2020-2-27(111)

Requirement

Fetch all the links on the Sohu homepage and filter out the pages whose content is related to basketball.

Algorithm

Step 1: fetch the full source of the Sohu homepage.
Step 2: filter the valid links out of that source.
Step 3: fetch the content of each valid link.
Step 4: check whether that content mentions basketball.
Step 5: save the pages that do.

A small skeleton of these five steps is sketched below; the two full implementations follow in the Code section.
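A minimal sketch of how the five steps fit together, before the full code. The helper names (fetch_page, extract_links, filter_links, save_page) and the sketch_*.html output path are placeholders introduced here, not part of the original code:

import re
import requests

def fetch_page(url):
    # Step 1 / Step 3: download one page and return its text
    return requests.get(url).text

def extract_links(html):
    # Step 2: pull every href value out of the source
    return re.findall(r'href="(.*?)"', html)

def filter_links(links):
    # Step 2: keep absolute links; protocol-relative "//host/..." links get "http:" prefixed
    out = []
    for l in links:
        l = l.strip()
        if l.startswith("//"):
            out.append("http:" + l)
        elif l.startswith("http"):
            out.append(l)
    return out

def save_page(html, path, encoding="utf-8"):
    # Step 5: write a matching page to disk
    with open(path, "w", encoding=encoding) as fp:
        fp.write(html)

home = fetch_page("http://www.sohu.com")
file_num = 1
for link in filter_links(extract_links(home)):
    try:
        page = fetch_page(link)
        if "篮球" in page:            # Step 4: keyword check
            save_page(page, "d:\\2019\\sohu\\sketch_" + str(file_num) + ".html")
            file_num += 1
    except Exception as e:
        print(e)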

Code

# Method 1

import requests
import re

# Fetch the homepage source
r = requests.get("http://www.sohu.com")
content = r.text
with open("d:\\2019\\sohu\\first_page.html","w",encoding="utf-8") as fp:
    fp.write(content)
    
# Extract every url on the page
all_links = re.findall(r'href="(.*?)"',content)
print("Total links found: %s" %len(all_links))
for link in all_links:
    with open("d:\\2019\\1.txt","a+",encoding="utf-8") as fp:
        fp.write(link+"\n")
print(len(all_links))
# Filter the urls
valid_links = []   # valid links
invalid_links = [] # invalid links
for link in all_links:
    if re.search(r'\.(jpg|jpeg|png|css|js|ico|tif|gif)$',link.strip()):
        #print(1,link)
        invalid_links.append(link)
        continue
    #href="#" just reloads the whole page
    #href="/" points to the site root
    elif link.strip() in ["","#","/"]:
        #print(2,link)
        invalid_links.append(link)
        continue
    elif link.strip().startswith("//"):
        #print(3,link)
        valid_links.append("http:"+link.strip())
        continue
    #javascript:void(0) is just a dead link that carries no information
    elif link.strip().count("javascript") >= 1 or \
         link.strip().count("mailto:") >= 1:
        #print(4,link)
        invalid_links.append(link.strip())
        continue
    
    elif re.match(r'/\w+',link):
        #print(8,link)
        # Relative links: prepend the site root taken from the original request url
        if re.match(r"http://.*?/", r.url.strip()):
            print(5,link)
            valid_links.append(re.match(r"http://.*?/", r.url.strip()).group().rstrip("/") + link.strip())  # base url ends with "/"
        else:
            print(6,link)
            valid_links.append(re.match(r"http://.*", r.url.strip()).group() + link.strip())  # base url has no trailing "/"
        continue
    
    else:
        #print(7,link)
        valid_links.append(link.strip())

print("有效链接有%s个" %len(valid_links))

# Fetch each valid link and save the pages that mention basketball
file_num = 1
for link in valid_links:
    try:
        r = requests.get(link.strip())
        content = r.text
        charset = r.encoding
        
        if "篮球" in content:
            print(file_num,charset)
            
            with open("d:\\2019\\sohu\\"+str(file_num)+charset+".html","w",encoding=charset) as fp:
                fp.write(content)
            
            file_num += 1
            print("***************************************")
    except Exception as e:
        #print("呀呀呀")
        print(e)

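A side note on the manual url joining above: the standard library's urllib.parse.urljoin resolves both protocol-relative ("//host/...") and root-relative ("/path") links against a base url in one call. The sketch below shows the same filtering done that way; the is_static helper and valid_links2 list are names introduced here for illustration, and all_links is reused from the method-1 code above:

import re
from urllib.parse import urljoin

base = "http://www.sohu.com"

def is_static(link):
    # Same idea as the regex filter above: drop image/style/script resources
    return re.search(r'\.(jpg|jpeg|png|css|js|ico|tif|gif)$', link) is not None

valid_links2 = []
for link in all_links:          # all_links comes from the method-1 code above
    link = link.strip()
    if not link or link in ("#", "/") or "javascript" in link or "mailto:" in link or is_static(link):
        continue
    # urljoin resolves both "//m.sohu.com/..." and "/a/123..." against the base in one step
    valid_links2.append(urljoin(base, link))
print("Valid links via urljoin: %s" % len(valid_links2))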
# Method 2

import requests
from bs4 import BeautifulSoup
import re

# Fetch the page source
r = requests.get("http://www.sohu.com")
soup = BeautifulSoup(r.text,"html.parser")

# Extract every url on the page
def has_href(tag):
    return tag.has_attr('href')

# Returns the list of tags that have an href attribute
#print(soup.find_all(has_href)) 

all_links = [i.get("href") for i in soup.find_all(has_href)]
print("全部链接有%s个" %len(all_links))
#全部链接比第一种用正则的方法少了两个链接
#是因为soup这种方式获得的链接不包含被注释掉的标签
#所以有效链接也少了一个,也是被注释掉了

for link in all_links:
    with open("d:\\2019\\link1.txt","a+",encoding="utf-8") as fp:
        fp.write(link+"\n")
all_links2 = re.findall(r'href="(.*?)"',r.text)
print("Total links via regex: %s" %len(all_links2))
for link in all_links2:
    with open("d:\\2019\\link2.txt","a+",encoding="utf-8") as fp:
        fp.write(link+"\n")


# Filter the urls
valid_links = []
invalid_links = []
for link in all_links:
    if re.search(r'\.(jpg|jpeg|png|css|js|ico|tif|gif)$',link.strip()):
        invalid_links.append(link.strip())
        continue
    elif link.strip() in ["","#","/"]:
        invalid_links.append(link.strip())
        continue
    elif link.strip().startswith("//"):
        valid_links.append("http:"+link.strip())
        continue
    elif link.strip().count("javascrip") >=1 or link.strip().count("mailto:") >=1:
        invalid_links.append(link.strip())
        continue
    else:
        valid_links.append(link.strip())
print("有效链接有%s个" %len(valid_links))

# Walk the valid links and save the source of any page that contains "新冠肺炎"
file_num = 1
for link in valid_links:
    try:
        # The request itself can fail, so it belongs inside the try block
        r = requests.get(link)
        content = r.text
        charset = r.encoding
        if "新冠肺炎" in content:
            with open("d:\\2019\\sohu\\"+str(file_num)+charset+".html","w",encoding=charset) as fp:
                fp.write(content)
            file_num += 1
    except:
        print("Something went wrong, skipping")
  