from urllib import request #导入urllib 下的request
import random
def spider(url):
    """Download the page at *url* and save its source to a local file.

    A User-Agent string is picked at random from a small pool of desktop
    browser identifiers and sent with the request. The response body is
    decoded to text and written, UTF-8 encoded, to a file in the current
    working directory named ``"05_"`` plus the last ``/``-separated segment
    of *url*.

    Args:
        url: Address of the page to fetch.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body cannot be decoded with the charset
            announced by the server (UTF-8 when none is announced).
    """
    # Pool of browser identifiers; one is chosen per request.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    ]
    user_agent = random.choice(user_agent_list)
    print(user_agent, url)

    # Request headers carrying the chosen User-Agent.
    headers = {"User-Agent": user_agent}

    # Send the request; the context manager closes the connection afterwards.
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as response:
        raw = response.read()
        # Prefer the charset announced by the server; default to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"

    # read() returns bytes, but the output file is opened in text mode,
    # so the body has to be decoded before it is written.
    html = raw.decode(charset)

    # Build the output file name from the last path segment of the URL.
    fileName = "05_" + url.split("/")[-1]

    # Save the page source.
    with open(fileName, "w", encoding="utf-8") as f:
        f.write(html)
if __name__ == "__main__":
    # Pages to download when the file is run as a script.
    target_urls = [
        "http://www.langlang2017.com/index.html",
        "http://www.langlang2017.com/route.html",
        "http://www.langlang2017.com/FAQ.html",
    ]
    # Fetch and save each page in turn.
    for page_url in target_urls:
        spider(page_url)
import random
def spider(url):
    """Fetch *url* with a randomly chosen User-Agent and save the page source.

    The response body is decoded to text and written, UTF-8 encoded, to a
    file in the current working directory whose name is ``"05_"`` followed
    by the last ``/``-separated segment of *url*.

    Args:
        url: Address of the page to fetch.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body cannot be decoded with the charset
            announced by the server (UTF-8 when none is announced).
    """
    # Several browser identifiers to rotate between.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    ]
    # random.choice picks one entry of the list for this request.
    user_agent = random.choice(user_agent_list)
    print(user_agent, url)

    # Request headers.
    headers = {"User-Agent": user_agent}

    # Send the request to the server; the with-block closes the response.
    req = request.Request(url, headers=headers)
    with request.urlopen(req) as response:
        body = response.read()
        # Use the charset declared in the response headers when present.
        charset = response.headers.get_content_charset() or "utf-8"

    # Decode the raw bytes: the file below is opened in text mode, and
    # writing bytes to it would raise TypeError.
    html = body.decode(charset)

    # Name the output file after the last path segment of the URL.
    l = url.split("/")
    fileName = "05_" + l[-1]

    # Write the page source to disk.
    with open(fileName, "w", encoding="utf-8") as f:
        f.write(html)
if __name__ == "__main__":
    # Script entry point: the fixed set of pages to download.
    pages = [
        "http://www.langlang2017.com/index.html",
        "http://www.langlang2017.com/route.html",
        "http://www.langlang2017.com/FAQ.html",
    ]
    # Process the pages one after another, in list order.
    for address in pages:
        spider(address)