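# 包含异常处理的爬虫模板 (crawler template with exception handling)
# 如果url访问有问题,则返回错误提示字符串 (if the URL cannot be fetched, an error-message string is returned)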
import requests
def getHtml(url):
    """Fetch ``url`` with an HTTP GET and return the body as text.

    Args:
        url: Address to request.

    Returns:
        The decoded response text on success. If the request fails
        (for example a malformed URL, a connection problem, a timeout or a
        4xx/5xx status), the fixed message "产生状态码异常" is returned
        instead; nothing is raised to the caller for those failures.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
        # Decode using the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        # The failure is reported through the return value, not by raising.
        return "产生状态码异常"
# Two sample targets: a well-formed address, and one with no scheme and a
# misspelled TLD (presumably deliberate, to exercise the failure path).
url1 = "http://www.baidu.com"
url2 = "www.baidu.con"

# Fetch each target in order and show whatever getHtml hands back.
for target in (url1, url2):
    print(getHtml(target))
# 添加url参数 (adding URL query parameters)
import requests
def getHtml(url, data):
    """Fetch ``url`` with ``data`` sent as URL query parameters.

    Args:
        url: Address to request.
        data: Mapping of query-string parameters appended to ``url``.

    Returns:
        The decoded response text on success. If the request fails,
        "爬取有误" is printed and ``None`` is returned.
    """
    try:
        # The second positional argument of requests.get is ``params``;
        # pass it by keyword so it is not mistaken for a request body.
        r = requests.get(url, params=data, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
        # Decode using the encoding detected from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("爬取有误")
        return None
# Search endpoint plus the query parameters to send with it.
url1 = "http://www.baidu.com/s"
data = dict(wd='生活')

# Run the request and print the returned page text (None on failure).
page = getHtml(url1, data)
print(page)
# 伪装头部 (disguising the request headers)
import requests
def getHtml(url, header):
    """Fetch ``url`` while sending the caller-supplied request headers.

    Args:
        url: Address to request.
        header: Mapping of HTTP header names to values sent with the request.

    Returns:
        The decoded response text on success. If the request fails,
        "爬取有误" is printed and ``None`` is returned.
    """
    try:
        r = requests.get(url, timeout=30, headers=header)
        r.raise_for_status()  # turn 4xx/5xx responses into an HTTPError
        # Fixed typo: the attribute is ``encoding``. The misspelled
        # ``enconding`` only created an unused attribute, so the detected
        # encoding was never applied when decoding ``r.text``.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are handled here, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("爬取有误")
        return None
url2 = "https://www.amazon.cn/dp/B07746N2J9"

# Browser-like headers used to disguise the request.
header = {
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
    # The value must not start with whitespace: requests validates header
    # values and raises InvalidHeader for a leading space, which made this
    # request fail before it was ever sent.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.8",
}

# Fetch the page with the disguised headers and print the result.
print(getHtml(url2, header))
# 爬取图片 (downloading an image)
import requests
# Download one image and store its raw bytes on disk.
try:
    # A timeout keeps the script from hanging forever on an unresponsive host.
    r = requests.get("https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1542865260406&di=58fd4edef623772af33422d71b8be0ad&imgtype=0&src=http%3A%2F%2Fs6.cdn.deahu.com%2Fshow%2Flfile%2F8E9CDE37D4E5AFBFEB9A6FB2CC594187.jpg", timeout=30)
    # Reject 4xx/5xx responses so an error page is never saved as the image.
    r.raise_for_status()
except requests.RequestException:
    print("爬取有误")
else:
    # 'wb' = write binary: r.content is the undecoded response body.
    with open('wallpaper.png', 'wb') as f:
        f.write(r.content)
# 模拟知乎登录 (simulating a logged-in Zhihu session)
import requests
# Request headers that replay an existing browser session via its Cookie.
# NOTE(review): the Cookie below is a session credential hard-coded in source;
# it should be loaded from the environment or a local, untracked config file.
headers = {
    'Cookie':'d_c0="AICk0gizWQ6PTkm_Prxgrk4SpW2vsJ-LFy8=|1539317790"; _zap=51a38663-f556-41e3-a9bb-fc9f051076d4; __utmv=51854390.100--|2=registration_date=20151202=1^3=entry_date=20151202=1; tst=r; __gads=ID=d46da9e018289437:T=1539774107:S=ALNI_MZTBeYxCc0xdsW7aOMfzlBWCX-oBw; _xsrf=5142c5f9-9faf-4231-be03-cc43edf4f953; __utma=51854390.357429689.1539317791.1540120015.1540650169.3; __utmc=51854390; __utmz=51854390.1540650169.3.3.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/people/song-bing-75-75/collections; capsion_ticket="2|1:0|10:1542014542|14:capsion_ticket|44:NThjYTNhMjI2ZTA2NDEzZGJiNTk5MTllYzA1MWRhMjA=|6975888841c5bd8cb4866877d89148937038cf2cd9869bcc7d7e8ed9a74a0f44"; z_c0="2|1:0|10:1542014553|4:z_c0|92:Mi4xVUpwWEFnQUFBQUFBZ0tUU0NMTlpEaVlBQUFCZ0FsVk5XWlRXWEFCME5EaE9JUDNIcHF3ekVfTFRjUEl6end0Zm9n|fee57031f076e80c0d16dc6041df6423bd602e97d17c8633ac1a37a478216dcf"; q_c1=c67f1d517d1540929ea774a2bb66f863|1542183224000|1539317788000; tgw_l7_route=4902c7c12bebebe28366186aba4ffcde',
    'Host':'www.zhihu.com',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3602.2 Safari/537.36'
}

# requests applies no timeout by default; set one so the call cannot hang
# indefinitely, matching the 30 s used by the getHtml helpers in this file.
r = requests.get('http://www.zhihu.com', headers=headers, timeout=30)
print(r.text)