Web Scraping, Part 1

urllib

from urllib import request

url = 'https://www.baidu.com/'

response = request.urlopen(url)
# Read the page content
text = response.read()
# Save the raw bytes to a local file
with open('./baidu.html', mode='wb') as fp:
    fp.write(text)
    print('Page data saved successfully')

urllib.request.Request gives more complete control over a request and can carry request headers and other data:

request1 = request.Request(url=url, headers=headers)
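A minimal sketch of the full flow, assuming a hypothetical headers dict with a browser-style User-Agent; the Request object is then passed to urlopen just like a plain URL:

from urllib import request

url = 'https://www.baidu.com/'
# Example header for illustration; any browser-style User-Agent works
headers = {'User-Agent': 'Mozilla/5.0'}

# Build a Request object that carries the headers
req = request.Request(url=url, headers=headers)
# urlopen accepts a Request object as well as a URL string
response = request.urlopen(req)
print(response.read().decode('utf-8'))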

Simulating a GET request

import urllib.parse
import urllib.request

url = 'http://httpbin.org/get?%s'
if __name__ == '__main__':
    params = {'age': 35, 'sex': '男', 'work_years': 15}
    # URL-encode the query parameters
    params = urllib.parse.urlencode(params)
    # If the data argument were given, the request would become a POST;
    # with a GET request, every parameter is exposed in the URL
    response = urllib.request.urlopen(url=url % (params))
    print(response.read().decode())

Simulating a POST request

import urllib.parse
import urllib.request

url = 'http://httpbin.org/post'
if __name__ == '__main__':
    # Simulate a POST request
    params = {'Language': 'Python', 'salary': 20000, 'work_time': 996}
    params = urllib.parse.urlencode(params).encode()  # encode the form data to bytes
    print(params)
    # By default the request goes out with the header User-Agent: Python-urllib/3.7
    response = urllib.request.urlopen(url=url, data=params)
    # Print the data returned by the server
    print(response.read().decode())

Accessing a page through a proxy

import urllib.request

url = 'http://httpbin.org/ip'
if __name__ == '__main__':
    # Request without a proxy
    response = urllib.request.urlopen(url=url)
    print(response.read().decode())
    # Use a proxy to disguise the crawler and avoid getting the IP banned
    ph = urllib.request.ProxyHandler({'http': '117.69.201.206:9999'})  # this proxy address may no longer work
    # Build an opener that routes requests through the proxy
    opener = urllib.request.build_opener(ph)
    # Open the URL through the proxy
    response2 = opener.open(url)
    print('With the proxy, the IP is:', response2.read().decode())

Using a proxy with requests

import requests

url = 'http://httpbin.org/ip'  # same test URL as above
response = requests.get(url=url, proxies={'http': 'http://455098435:lbrv3bgb@121.42.140.113:16816'}, timeout=20)
print(response.text)
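A slightly fuller sketch with a placeholder proxy address (proxy.example.com:8080 is not a real proxy): requests takes one entry per URL scheme, and failures surface as requests exceptions:

import requests

url = 'http://httpbin.org/ip'
# Placeholder proxy address for illustration only
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080',
}
try:
    response = requests.get(url, proxies=proxies, timeout=20)
    print(response.text)
except requests.exceptions.ProxyError:
    print('The proxy refused the connection')
except requests.exceptions.Timeout:
    print('The request timed out')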

Batch downloading images

import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests

url1 = 'http://sc.chinaz.com/tupian/index.html'
url = 'http://sc.chinaz.com/tupian/index_%d.html'

# Thread-pool version for faster downloads
def download_image(img_url):
    response = requests.get(img_url)
    filename = img_url.rsplit('/', 1)[-1]
    with open('pictures/%s' % (filename), mode='wb') as fp:
        fp.write(response.content)
        # print('------- image %s saved --------' % (filename))

def get_image_urls(num):
    start2 = time.time()
    for i in range(1, num + 1):
        # The first page has no index suffix in its URL
        if i == 1:
            url_pic = url1
        else:
            url_pic = url % (i)
        print('------- downloading images from page %d --------' % (i))
        response = requests.get(url_pic)
        response.encoding = 'utf-8'
        # The image address sits in the src2 attribute on this page
        img_urls = re.findall(r'<img src2="(.*?)"', response.text)

        # Only one worker here; raise the count for real concurrency
        with ThreadPoolExecutor(1) as executor:
            for img_url in img_urls:
                executor.submit(download_image, img_url)
                # Plain-thread alternative:
                # t = threading.Thread(target=download_image, args=(img_url,))
                # t.start()
    end2 = time.time()
    print("time2: " + str(end2 - start2))

if __name__ == '__main__':
    try:
        num = int(input('How many pages to fetch? '))
    except ValueError:
        print('Please enter a number!')
        num = int(input('How many pages to fetch? '))
    # Make sure the output directory exists before downloading
    os.makedirs('pictures', exist_ok=True)
    get_image_urls(num)
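With ThreadPoolExecutor(1) the pool has a single worker, so downloads still run one at a time. A minimal sketch of a genuinely concurrent variant, assuming the download_image function above (the worker count of 8 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(img_urls, workers=8):
    # Submit every download, then wait for all of them to finish
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(download_image, u) for u in img_urls]
        for future in as_completed(futures):
            # Re-raise any exception that happened inside a worker thread
            future.result()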

XPath with the third-party library lxml

from lxml import etree

# The XML document as a str
books = '''
<?xml version="1.0" encoding="utf-8"?>
<bookstore> 
  <book category="cooking"> 
    <title lang="en">Everyday Italian</title>  
    <author>Giada De Laurentiis</author>  
    <year>2005</year>  
    <price>30.00</price> 
  </book>  
  <book category="children"> 
    <title lang="en">Harry Potter</title>  
    <author>J K. Rowling</author>  
    <year>2005</year>  
    <price>29.99</price> 
  </book>  
  <book category="web" cover="paperback"> 
    <title lang="en">XQuery Kick Start</title>  
    <author>James McGovern</author>  
    <author>Per Bothner</author>  
    <author>Kurt Cagle</author>  
    <author>James Linn</author>  
    <author>Vaidyanathan Nagarajan</author>  
    <year>2003</year>  
    <price>50</price> 
  </book> 
</bookstore>
'''
html = etree.HTML(books)
# Absolute path query (etree.HTML wraps the fragment in html/body)
result = html.xpath('/html/body/bookstore/book/@category')  # ['cooking', 'children', 'web']
# // matches nodes anywhere in the document, regardless of position
print(html.xpath('//book/@category'))  # ['cooking', 'children', 'web']
book_nodes = html.xpath('//book')
# . queries relative to the current node
print('First book, relative query:', book_nodes[0].xpath('./year/text()'))  # ['2005']
# All attribute values in the document
print(html.xpath('//@*'))
# ['cooking', 'en', 'children', 'en', 'web', 'paperback', 'en']
# Titles of books whose price is divisible by 2
print(html.xpath('//book[price mod 2 = 0]/title/text()'))
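A few more queries against the same books document, as a sketch of common XPath predicates (the expected results in the comments follow from the XML above):

# Books cheaper than 30.00
print(html.xpath('//book[price < 30]/title/text()'))          # ['Harry Potter']
# The last book element
print(html.xpath('//book[last()]/title/text()'))               # ['XQuery Kick Start']
# Books carrying a cover attribute
print(html.xpath('//book[@cover="paperback"]/title/text()'))   # ['XQuery Kick Start']
# All authors of the third book
print(html.xpath('//book[3]/author/text()'))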

XPath example

import requests
from lxml import etree

url1 = 'https://www.neihanba.com/dz/'
url = 'https://www.neihanba.com/dz/list_%d.html'

if __name__ == '__main__':
    fp = open('./duanzi.csv', mode='a', encoding='utf-8')
    for i in range(1, 101):
        # The first page has no index suffix
        if i == 1:
            url_duanzi = url1
        else:
            url_duanzi = url % (i)
        response = requests.get(url_duanzi)
        response.encoding = 'gbk'
        content = response.text
        html = etree.HTML(content)
        result = html.xpath('//ul[@class="piclist longList"]/li')
        for li in result:
            try:
                title = li.xpath('.//h4/a/b/text()')[0]
                content = li.xpath('.//div[@class="f18 mb20"]/text()')[0].strip()
                info = ''.join(li.xpath('.//div[@class="ft"]/span//text()')[1:])
                fp.write('%s\t%s\t%s\n' % (title, content, info))
            except Exception:
                # On failure, skip for now; log and re-crawl these items separately later
                pass
        print('Page %d saved successfully!' % (i))
    fp.close()
    # !!! The requests.get call above still has no exception handling
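As the note above says, the requests.get call itself has no exception handling. One possible sketch of a retry wrapper (the retry count and timeout are arbitrary choices):

import time

import requests

def fetch(url, retries=3, timeout=10):
    # Fetch a page, retrying a few times on network errors
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print('Attempt %d on %s failed: %s' % (attempt, url, e))
            time.sleep(1)
    return None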