Task 1: Sina Weibo trending-search titles
1> Fetch the page
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    target = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    req = requests.get(url=target)
    html = req.text
    bf = BeautifulSoup(html, 'html.parser')
2> Grab the td-02 tags
    t = bf.find_all('td', class_='td-02')
3> Get the child tags under each td-02 tag
    for i in t:
        first = i.select('a')
4> Print as text
        print(first[0].text)
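Weibo may not serve the real list page to a client that identifies itself as python-requests; below is a minimal hardened variant of the same crawl. The browser-style User-Agent, the timeout, and the empty-cell guard are my additions, assuming the page still keeps each title inside an <a> under td-02:

from bs4 import BeautifulSoup
import requests

target = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
headers = {'User-Agent': 'Mozilla/5.0'}        # browser-style UA, see Task 4
req = requests.get(url=target, headers=headers, timeout=10)
req.raise_for_status()                          # fail loudly on HTTP errors
bf = BeautifulSoup(req.text, 'html.parser')
for cell in bf.find_all('td', class_='td-02'):
    link = cell.select('a')
    if link:                                    # some cells carry no link
        print(link[0].text)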
Task 2: Fetch a novel's chapter list, then crawl each chapter's content
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    server = 'http://www.biqukan.com/'
    target = 'http://www.biqukan.com/1_1094/'
    req = requests.get(url=target)
    html = req.text
    div_bf = BeautifulSoup(html, 'html.parser')
    div = div_bf.find_all('div', class_='listmain')
    a_bf = BeautifulSoup(str(div[0]), 'html.parser')
    a = a_bf.find_all('a')
    for each in a:
        surl = server + each.get('href')
        sreq = requests.get(url=surl)
        shtml = sreq.text
        sbf = BeautifulSoup(shtml, 'html.parser')
        t = sbf.find_all('div', class_='content')
        print(t[0].text)
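In practice the chapters would be written to a file rather than printed, with a pause between requests to keep the crawl polite; here is a sketch of the same loop under those assumptions (the file name and the one-second delay are arbitrary choices of mine):

import time

with open('novel.txt', 'w', encoding='utf-8') as f:
    for each in a:
        sreq = requests.get(url=server + each.get('href'))
        sbf = BeautifulSoup(sreq.text, 'html.parser')
        t = sbf.find_all('div', class_='content')
        if t:                                  # skip pages whose markup differs
            f.write(each.get_text() + '\n')    # chapter title
            f.write(t[0].text + '\n\n')        # chapter body
        time.sleep(1)                          # be polite to the server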
Task 3: JD product details
import requests

url = "https://item.jd.com/4806709.html"
try:
    r = requests.get(url)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except:
    print("Crawl failed")
Task 4: Crawl an Amazon product page
import requests

url = "https://www.amazon.cn/dp/B07896QRWF/ref=sr_1_1?fst=as%3Aoff&m=A1AJ19PSB66TGU&qid=1556456586&refinements=p_6%3AA1AJ19PSB66TGU&rnid=658391051&s=books&sr=1-1"
try:
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])
except:
    print("Crawl failed")
note: Inspecting the request headers shows that the default user agent identifies itself as python-requests; here we override it with 'Mozilla/5.0' (a generic browser token) so the site treats the request as coming from an ordinary browser.
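You can verify what was actually sent by inspecting r.request.headers on the response; a quick check using only the requests API:

import requests

r = requests.get("https://www.amazon.cn")
print(r.request.headers['User-Agent'])    # the default python-requests/x.y UA

r = requests.get("https://www.amazon.cn", headers={'user-agent': 'Mozilla/5.0'})
print(r.request.headers['User-Agent'])    # now reports Mozilla/5.0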
Task 5: Baidu keyword search
import requests

keyword = "Python"
try:
    kv = {'wd': keyword}
    r = requests.get("http://www.baidu.com/s", params=kv)
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except:
    print("Crawl failed")
Appendix: keyword search on 360
import requests

keyword = "Python"
try:
    kv = {'q': keyword}
    r = requests.get("http://www.so.com/s", params=kv)
    print(r.request.url)
    r.raise_for_status()
    print(len(r.text))
except:
    print("Crawl failed")
note:
Baidu's keyword-search interface:
http://www.baidu.com/s?wd=keyword
360's keyword-search interface:
http://www.so.com/s?q=keyword
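Passing the keyword through params is equivalent to appending it to the URL by hand, except that requests also percent-encodes the value; a quick demonstration:

import requests

r = requests.get("http://www.baidu.com/s", params={'wd': 'Python 爬虫'})
print(r.request.url)    # the assembled, percent-encoded URL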
Task 6: Crawl an image
import requests
import os

# url is the address of the image (left blank here; supply a real image URL)
url = ""
root = "H://pics//"
path = root + url.split('/')[-1]
try:
    # if the root directory does not exist, create it
    if not os.path.exists(root):
        os.mkdir(root)
    # skip the download if the file is already there
    if not os.path.exists(path):
        r = requests.get(url)
        with open(path, 'wb') as f:
            f.write(r.content)
        print("File saved")
    else:
        print("File already exists")
except:
    print("Crawl failed")
Task 7: Automatic lookup of an IP address's location
import requests

url = "http://m.ip138.com/ip.asp?ip="
try:
    r = requests.get(url + '202.204.80.112')
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[-500:])
except:
    print("Crawl failed")
Task 8: Crawl the links in a page
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
# the following expressions echo their values in an interactive session
soup.a           # the first <a> tag
soup.body        # the <body> tag
soup.a.parent    # the parent of the first <a> tag
soup.a.contents  # list of the tag's direct children
soup.a.children  # iterator over the tag's direct children
for link in soup.find_all('a'):
    print(link.get('href'))
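A short sketch of walking the tree downward with .children: the iterator yields whitespace text nodes as well as tags, so we filter on the type, the same trick Task 9 below relies on:

import requests
import bs4
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
for child in soup.body.children:
    if isinstance(child, bs4.element.Tag):    # skip NavigableString nodes
        print(child.name)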
Task 9: Crawl a university ranking
import requests
from bs4 import BeautifulSoup
import bs4

# fetch the HTML page
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""

'''
Extract the content
'''
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):
            tds = tr('td')   # shorthand for tr.find_all('td')
            ulist.append([tds[0].string, tds[1].string, tds[2].string])

'''
Print the university list
'''
def printUnivList(ulist, num):
    print("{:^10}\t{:^6}\t{:^10}".format("Rank", "School", "Score"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^6}\t{:^10}".format(u[0], u[1], u[2]))

def main():
    uinfo = []
    url = 'http://www.zuihaodaxue.com/zuihaodaxuepaiming2016.html'
    html = getHTMLText(url)
    fillUnivList(uinfo, html)
    printUnivList(uinfo, 20)

main()
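When the school names are Chinese, the half-width space that format() pads with misaligns the columns; a common fix is to pad with the full-width CJK space chr(12288) instead. A variant of printUnivList using that trick (not part of the original code):

def printUnivList(ulist, num):
    # {1:{3}^10} centers field 1 using argument 3 (the full-width space) as fill
    tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
    print(tplt.format("Rank", "School", "Score", chr(12288)))
    for i in range(num):
        u = ulist[i]
        print(tplt.format(u[0], u[1], u[2], chr(12288)))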
This article walked through a series of practical crawler examples, including scraping Weibo trending searches, novel chapters, and JD product details, covering page scraping, image downloading, and keyword search.