e.g. Charles SSL Proxying Settings:
https://www.charlesproxy.com/
https://www.jianshu.com/p/5d38d552fa52 (SSL Proxying Settings)
import urllib.request
# Send a request to the specified URL and return the server's response (a file-like object)
response = urllib.request.urlopen("http://www.baidu.com")
#data =response.read()
#print(data)
Write the crawled content to a file; read() assigns the whole response to a single bytes variable
#with open(r"H:\study\file1.html", "wb") as f:
#    f.write(data)
Read a single line
data = response.readline()
Read the entire content; readlines() assigns the data to a list variable (one item per line)
data = response.readlines()
print(data)
response attributes
Return header information about the current response
print(response.info())
Return the status code
print(response.getcode())
if response.getcode() == 200 or response.getcode() == 304:
    # process the response
    pass
Return the URL that was actually crawled
print(response.geturl())
Decoding
url = "https://www.baidu.com/"
Write the crawled page directly to a file
urllib.request.urlretrieve("http://www.baidu.com", filename=r"H:\study\file1.html")
Some temporary cache files are produced while this runs
Clear the cache
urllib.request.urlcleanup()
Decode a percent-encoded URL (urllib.parse needs to be imported)
url = urllib.parse.unquote(url)
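A minimal sketch of the matching quote()/unquote() pair from urllib.parse, useful when a URL contains Chinese or other non-ASCII characters (the Baidu search URL below is only an illustration):
import urllib.parse
# quote() percent-encodes the non-ASCII part of the query
raw = "https://www.baidu.com/s?wd=" + urllib.parse.quote("爬虫")
print(raw)                          # https://www.baidu.com/s?wd=%E7%88%AC%E8%99%AB
# unquote() reverses the encoding
print(urllib.parse.unquote(raw))    # https://www.baidu.com/s?wd=爬虫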
Simulating a browser
url = "http://www.baidu.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"
}
# Build a Request object that carries the headers
req = urllib.request.Request(url, headers=headers)
# Send the request
response = urllib.request.urlopen(req)
data = response.read().decode("utf-8")
print(data)
Bypassing anti-crawler measures (rotating the User-Agent):
import urllib.request
import random
# Simulate a browser with a randomly chosen User-Agent
url = "http://www.baidu.com"
agentList = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
]
agentStr = random.choice(agentList)
req = urllib.request.Request(url)
# Add the User-Agent header to the request (the value must not include the "User-Agent:" prefix)
req.add_header("User-Agent", agentStr)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
Setting a timeout
If the page does not respond within the given time, the request is treated as timed out and the crawl fails
for i in range(1, 100):
    try:
        # the timeout is deliberately tiny here so that most requests fail
        response = urllib.request.urlopen("http://www.baidu.com", timeout=0.00005)
        print(len(response.read().decode("utf-8")))
    except Exception:
        print("ERROR")
HTTP requests:
Used for passing messages between client and server (see the sketch after this list)
GET: passes information through the URL itself; the data can be appended directly to the URL, but it is exposed
POST: submits data to the server; a popular and safer way of transferring data
PUT: asks the server to store a resource, usually at a specified location
DELETE: asks the server to delete a resource
HEAD: requests only the HTTP headers of the corresponding resource
OPTIONS: returns the request methods supported by the given URL
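A minimal sketch of sending a method other than GET/POST with urllib: urllib.request.Request accepts a method= argument (Python 3.3+). httpbin.org is used here only as a convenient test service; it is not part of these notes.
import urllib.request
# Issue a HEAD request: only the status line and headers come back, no body
req = urllib.request.Request("http://httpbin.org/get", method="HEAD")
response = urllib.request.urlopen(req)
print(response.getcode())   # status code
print(response.info())      # response headers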
GET:
Characteristics: the data is appended to the request path and passed to the server
Advantage: fast
Disadvantages: can only carry a small amount of data, and the data is exposed in the URL
import urllib.request
url = "http://www.baidu.com"
response = urllib.request.urlopen(url)
data = response.read().decode("utf-8")
print(data)
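Since GET simply appends the data to the request path, the query string can be built with urllib.parse.urlencode. A minimal sketch (the wd search parameter is assumed here purely for illustration):
import urllib.request
import urllib.parse
# urlencode builds the query string and percent-encodes non-ASCII values
params = urllib.parse.urlencode({"wd": "python 爬虫"})
url = "http://www.baidu.com/s?" + params
response = urllib.request.urlopen(url)
print(len(response.read()))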
POST:
Characteristics: the parameters are packed and sent separately in the request body
Advantages: can carry a large amount of data and keeps it out of the URL
Disadvantage: slower
import urllib.request
import urllib.parse
url = "http://www.test.com:8080/index"
data = {
    "username": "tom",
    "passwd": "123"
}
Pack the data to be sent; urlopen() requires the POST body as bytes, hence the encode()
postData = urllib.parse.urlencode(data).encode("utf-8")
req = urllib.request.Request(url, data=postData)
response = urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
Crawling data from dynamic Ajax requests:
import urllib.request
import ssl
import json
def ajaxCrawler(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"
    }
    req = urllib.request.Request(url, headers=headers)
    # Use ssl to create an unverified context so the HTTPS certificate is not checked
    context = ssl._create_unverified_context()
    response = urllib.request.urlopen(req, context=context)
    jsonStr = response.read().decode("utf-8")
    jsonData = json.loads(jsonStr)
    return jsonData
url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=0"
info = ajaxCrawler(url)
print(info)
for i in range(1, 10):
    url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&page_limit=50&page_start=" + str(i * 50)
    info = ajaxCrawler(url)
    print(len(info))
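The JSON returned by this endpoint appears to be an object whose "subjects" key holds a list of movies; assuming that structure (the "subjects", "title" and "rate" fields are assumptions, not guaranteed by these notes), one page could be unpacked like this:
info = ajaxCrawler(url)
# "subjects", "title" and "rate" are assumed field names in the Ajax response
for movie in info.get("subjects", []):
    print(movie.get("title"), movie.get("rate"))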