一.知识点
1.URL:protocol://hostname[:port]/path/[;parameters]
①协议:http, https, ftp, file, ed2k
②服务器的域名或IP地址, 后面可跟端口号(可省略, http默认端口号为80)
③资源具体地址, 如目录或文件名
通用例子:
import urllib.request

url = (' 一个网站地址 ')  # placeholder: replace with a real web address
req = urllib.request.Request(url)  # build the request object
# Present a browser User-Agent so the request does not carry the default
# Python one (the other way to hide the crawler is to go through a proxy).
req.add_header(
    'User-Agent',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 '
    'Core/1.53.3226.400 QQBrowser/9.6.11681.400',
)
# Open the Request object (not the bare url), otherwise the header added
# above is never sent.
with urllib.request.urlopen(req) as response:  # response
    html = response.read().decode('utf-8')  # read the body and decode it to text
④代理三大步骤:
①参数是一个字典:{ '类型' : '代理IP:端口号' }, 用它创建代理处理器: proxy_support = urllib.request.ProxyHandler({ '类型' : '代理IP:端口号' })
②定制,创建一个opener
opener = urllib.request.build_opener(proxy_support)
③a 安装opener
urllib.request.install_opener(opener)
b 调用opener
opener.open(url)
二.爬虫煎蛋网代码
import urllib.request
def url_open(url):
    """Fetch *url* and return the raw response body.

    The request carries a browser-like ``User-Agent`` header.

    Args:
        url: Address to fetch.

    Returns:
        The response body as undecoded ``bytes``.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 '
        'Core/1.53.3226.400 QQBrowser/9.6.11681.400',
    )
    # Open the prepared Request rather than the bare URL; passing the URL
    # would build a fresh request and silently drop the header set above.
    with urllib.request.urlopen(req) as response:
        return response.read()
def get_page(url):
    """Return the current comment-page number found on the page at *url*.

    The page is fetched with ``url_open`` and decoded as UTF-8. The number
    is the text between the ``current-comment-page`` marker and the next
    ``]``.

    Args:
        url: Address of the page to inspect.

    Returns:
        The page number as a string (not converted to ``int``).

    Raises:
        ValueError: If the marker or the closing ``]`` is not in the page.
    """
    html = url_open(url).decode('utf-8')

    marker_pos = html.find('current-comment-page')
    if marker_pos == -1:
        # Without this check, find() returning -1 would make the slice
        # below return unrelated text instead of failing.
        raise ValueError("'current-comment-page' marker not found in page")

    # 23 = the 20-character marker plus the 3 characters between it and the
    # number (presumably '">[' -- confirm against the live markup).
    start = marker_pos + 23
    end = html.find(']', start)
    if end == -1:
        raise ValueError("closing ']' after page number not found in page")
    return html[start:end]
def find_imgs(url):
    """Collect the addresses of the ``.jpg`` images referenced on a page.

    The page is fetched with ``url_open``, decoded as UTF-8 and scanned for
    ``img src=`` occurrences. An occurrence counts only when ``.jpg``
    appears within 255 characters of its start.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of image URLs in page order. Protocol-relative addresses
        (``//host/x.jpg``) get an ``http:`` scheme; any other address is
        resolved against *url*.
    """
    from urllib.parse import urljoin

    html = url_open(url).decode('utf-8')
    img_addrs = []

    tag_pos = html.find('img src=')
    while tag_pos != -1:
        # Bound the search so a '.jpg' belonging to some later, unrelated
        # tag is not attributed to this one.
        ext_pos = html.find('.jpg', tag_pos, tag_pos + 255)
        if ext_pos != -1:
            # +9 skips 'img src=' plus one more character (presumably the
            # opening quote); +4 keeps the '.jpg' extension itself.
            src = html[tag_pos + 9:ext_pos + 4]
            if src.startswith('//'):
                img_addrs.append('http:' + src)
            else:
                # Blindly prefixing 'http:' would corrupt addresses that
                # are already absolute or are relative to the page.
                img_addrs.append(urljoin(url, src))
            resume_at = ext_pos
        else:
            resume_at = tag_pos + 9
        tag_pos = html.find('img src=', resume_at)

    return img_addrs
def save_imgs(folder, img_addrs):
    """Download every image in *img_addrs* and store it inside *folder*.

    Each file is named after the last ``/``-separated component of its
    address. *folder* is created if it does not exist yet.

    Args:
        folder: Directory that receives the downloaded files.
        img_addrs: Iterable of image URLs to download with ``url_open``.
    """
    import os

    os.makedirs(folder, exist_ok=True)
    for each in img_addrs:
        filename = os.path.join(folder, each.split('/')[-1])
        # Download before opening the file so a failed request does not
        # leave an empty file behind.
        img = url_open(each)
        with open(filename, 'wb') as f:
            f.write(img)
def download_mm(folder='ooxx', pages=10):
    """Download the images from the newest *pages* comment pages.

    The newest page number is read from the site's front page with
    ``get_page``; pages are then visited in descending order.

    Args:
        folder: Directory handed to ``save_imgs`` for the downloads.
        pages: Maximum number of consecutive pages to visit.
    """
    url = "http://jandan.net/ooxx/"
    newest_page = int(get_page(url))

    for i in range(pages):
        # Offset from the newest page each time; decrementing a running
        # counter by i would skip pages (n, n-1, n-3, n-6, ...).
        page_num = newest_page - i
        if page_num < 1:
            # Assumes page numbering starts at 1 -- nothing older to fetch.
            break
        page_url = url + 'page-' + str(page_num) + '#comments'  # page address
        img_addrs = find_imgs(page_url)  # image addresses found on that page
        save_imgs(folder, img_addrs)  # store those images on disk
# Start the download with default settings only when run as a script,
# not when this module is imported.
if __name__ == '__main__':
    download_mm()