import random
import urllib.parse
import urllib.request

# --- Basic request ---------------------------------------------------------
# Send a request to the given URL; urlopen returns a file-like response
# object. Using it as a context manager guarantees the underlying socket is
# closed even if one of the steps below raises.
with urllib.request.urlopen("http://www.baidu.com") as response:
    # Alternative: read the whole body at once as a single bytes object.
    # data = response.read()
    # print(data)
    # print(type(data))  # <class 'bytes'>
    #
    # Alternative: write the fetched content to a file.
    # with open('hello.html', 'wb') as f:
    #     f.write(data)

    # Read the body as a list of lines (each item is a bytes object).
    data = response.readlines()
    print(data)
    # <class 'list'>
    print(type(data))

    # Show one decoded sample line. The page may have fewer than 39 lines,
    # so guard the index instead of crashing with IndexError.
    if len(data) > 38:
        print(data[38].decode('utf-8'))

    # --- Response attributes ----------------------------------------------
    # info() returns the response metadata (headers).
    print(response.info())
    # getcode() returns the HTTP status code (e.g. 200, or 304 for cached).
    print(response.getcode())
    # geturl() returns the URL that was actually retrieved.
    print(response.geturl())

# --- URL encoding ------------------------------------------------------------
# Non-ASCII characters (e.g. Chinese) in a URL must be percent-encoded.
# quote/unquote are part of the public urllib.parse API; urllib.request only
# exposes them as an undocumented side effect of its own imports.
print(urllib.parse.quote('百度'))
# unquote decodes the percent-encoded form back to text.
print(urllib.parse.unquote('%E7%99%BE%E5%BA%A6'))

# Alternative: download straight into a file.
# urllib.request.urlretrieve('http://baidu.com', 'hello11.html')
# Clear the cache left behind by urlretrieve.
# urllib.request.urlcleanup()

# --- Simulating a browser ----------------------------------------------------
# Request headers that mimic a desktop browser.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/49.0.2623.112 Safari/537.36"
}
# Build a Request object carrying the custom headers.
req = urllib.request.Request('http://www.baidu.com', headers=headers)
with urllib.request.urlopen(req) as response:
    data = response.readlines()
print(data)

# --- Random User-Agent -------------------------------------------------------
# Pool of User-Agent strings; one is picked at random for the request below.
agentlist = [
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    "Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
    "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
]
agentStr = random.choice(agentlist)
req = urllib.request.Request("http://127.0.0.1:8000/app/market/103536/0/0/")
# Headers can also be attached after the Request has been created.
req.add_header('User-Agent', agentStr)
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
# If the page does not respond in time, treat the request as timed out and
# move on to the next attempt.
url = "https://blog.youkuaiyun.com/weixin_42141853/article/details/80552476"
for i in range(1, 30):  # 29 attempts
    try:
        # The context manager closes the connection after every attempt.
        with urllib.request.urlopen(url, timeout=0.5) as response:
            # Named `html` rather than `str` so the builtin is not shadowed.
            html = response.read().decode('utf-8')
        print(html)
        print(len(html))
    except Exception:
        # Stay best-effort: any failure of this attempt is reported and the
        # loop continues. Catching Exception instead of using a bare except
        # lets KeyboardInterrupt / SystemExit stop the script.
        print("请求超时!")
# Second pass of the timeout probe: if the page does not respond in time,
# treat the request as timed out and move on to the next attempt.
url = "https://blog.youkuaiyun.com/weixin_42141853/article/details/80552476"
for i in range(1, 30):  # 29 attempts
    try:
        # The context manager closes the connection after every attempt.
        with urllib.request.urlopen(url, timeout=0.5) as response:
            # Named `html` rather than `str` so the builtin is not shadowed.
            html = response.read().decode('utf-8')
        print(html)
        print(len(html))
    except Exception:
        # Stay best-effort: any failure of this attempt is reported and the
        # loop continues. Catching Exception instead of using a bare except
        # lets KeyboardInterrupt / SystemExit stop the script.
        print("请求超时!")