1. 爬取购物网站商品页面
url = "http://item.jd.com/2967929.html"
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
print(r.text)
except:
print("爬取失败")
2. 修改user-agent
有些网站必须修改user-agent为浏览器才可以爬取
url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
kv = {'user-agent': 'Mozilla/5.0'}
r = requests.get(url, headers=kv)
r.raise_for_status()
r.encoding = r.apparent_encoding
print(r.text)
except:
print("爬取失败")
3. 获取搜索引擎结果
try:
kv = {'wd': 'Python'}
r = requests.get("http://www.baidu.com/s", params=kv)
print(r.request.url)
r.raise_for_status()
print(len(r.text))
except:
print("爬取失败")
4. 爬取图片并保存
url = "http://image.nationalgeographic.com.cn/2017/0211/20170211061910157.jpg"
root = "D://"
path = root + url.split('/')[-1]
try:
if not os.path.exists(root):
os.mkdir(root)
if not os.path.exists(path):
r = requests.get(url)
with open(path, 'wb') as f:
f.write(r.content)
f.close()
print("文件保存成功")
else:
print("文件已存在")
except:
print("爬取失败")