Web Scraping, Part 1

urllib

from urllib import request

url = 'https://www.baidu.com/'

response = request.urlopen(url)
# Read the page content
text = response.read()
# Save the raw bytes to a local file
with open('./baidu.html', mode='wb') as fp:
    fp.write(text)
    print('Page data saved successfully')

urllib.request.Request gives more complete control over a request and can carry request headers and other data:

request1 = request.Request(url=url, headers=headers)
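A minimal sketch of the full flow, assuming a hypothetical headers dict with a browser-style User-Agent; the Request object is then passed to urlopen just like a plain URL:

from urllib import request

url = 'https://www.baidu.com/'
# Example header for illustration; any browser-style User-Agent works
headers = {'User-Agent': 'Mozilla/5.0'}

# Build a Request object that carries the headers
req = request.Request(url=url, headers=headers)
# urlopen accepts a Request object as well as a URL string
response = request.urlopen(req)
print(response.read().decode('utf-8'))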

Simulating a GET request

import urllib.parse
import urllib.request

url = 'http://httpbin.org/get?%s'
if __name__ == '__main__':
    params = {'age': 35, 'sex': '男', 'work_years': 15}
    # URL-encode the query parameters
    params = urllib.parse.urlencode(params)
    # If the data argument were given, the request would become a POST;
    # with a GET request, every parameter is exposed in the URL
    response = urllib.request.urlopen(url=url % (params))
    print(response.read().decode())

Simulating a POST request

import urllib.parse
import urllib.request

url = 'http://httpbin.org/post'
if __name__ == '__main__':
    # Simulate a POST request
    params = {'Language': 'Python', 'salary': 20000, 'work_time': 996}
    params = urllib.parse.urlencode(params).encode()  # encode the form data to bytes
    print(params)
    # By default the request goes out with the header User-Agent: Python-urllib/3.7
    response = urllib.request.urlopen(url=url, data=params)
    # Print the data returned by the server
    print(response.read().decode())

Accessing a page through a proxy

import urllib.request

url = 'http://httpbin.org/ip'
if __name__ == '__main__':
    # Request without a proxy
    response = urllib.request.urlopen(url=url)
    print(response.read().decode())
    # Use a proxy to disguise the crawler and avoid getting the IP banned
    ph = urllib.request.ProxyHandler({'http': '117.69.201.206:9999'})  # this proxy address may no longer work
    # Build an opener that routes requests through the proxy
    opener = urllib.request.build_opener(ph)
    # Open the URL through the proxy
    response2 = opener.open(url)
    print('With the proxy, the IP is:', response2.read().decode())

Using a proxy with requests

import requests

url = 'http://httpbin.org/ip'  # same test URL as above
response = requests.get(url=url, proxies={'http': 'http://455098435:lbrv3bgb@121.42.140.113:16816'}, timeout=20)
print(response.text)
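A slightly fuller sketch with a placeholder proxy address (proxy.example.com:8080 is not a real proxy): requests takes one entry per URL scheme, and failures surface as requests exceptions:

import requests

url = 'http://httpbin.org/ip'
# Placeholder proxy address for illustration only
proxies = {
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080',
}
try:
    response = requests.get(url, proxies=proxies, timeout=20)
    print(response.text)
except requests.exceptions.ProxyError:
    print('The proxy refused the connection')
except requests.exceptions.Timeout:
    print('The request timed out')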

Batch downloading images

import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests

url1 = 'http://sc.chinaz.com/tupian/index.html'
url = 'http://sc.chinaz.com/tupian/index_%d.html'

# Thread-pool version for faster downloads
def download_image(img_url):
    response = requests.get(img_url)
    filename = img_url.rsplit('/', 1)[-1]
    with open('pictures/%s' % (filename), mode='wb') as fp:
        fp.write(response.content)
        # print('------- image %s saved --------' % (filename))

def get_image_urls(num):
    start2 = time.time()
    for i in range(1, num + 1):
        # The first page has no index suffix in its URL
        if i == 1:
            url_pic = url1
        else:
            url_pic = url % (i)
        print('------- downloading images from page %d --------' % (i))
        response = requests.get(url_pic)
        response.encoding = 'utf-8'
        # The image address sits in the src2 attribute on this page
        img_urls = re.findall(r'<img src2="(.*?)"', response.text)

        # Only one worker here; raise the count for real concurrency
        with ThreadPoolExecutor(1) as executor:
            for img_url in img_urls:
                executor.submit(download_image, img_url)
                # Plain-thread alternative:
                # t = threading.Thread(target=download_image, args=(img_url,))
                # t.start()
    end2 = time.time()
    print("time2: " + str(end2 - start2))

if __name__ == '__main__':
    try:
        num = int(input('How many pages to fetch? '))
    except ValueError:
        print('Please enter a number!')
        num = int(input('How many pages to fetch? '))
    # Make sure the output directory exists before downloading
    os.makedirs('pictures', exist_ok=True)
    get_image_urls(num)
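With ThreadPoolExecutor(1) the pool has a single worker, so downloads still run one at a time. A minimal sketch of a genuinely concurrent variant, assuming the download_image function above (the worker count of 8 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor, as_completed

def download_all(img_urls, workers=8):
    # Submit every download, then wait for all of them to finish
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(download_image, u) for u in img_urls]
        for future in as_completed(futures):
            # Re-raise any exception that happened inside a worker thread
            future.result()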

XPath with the third-party library lxml

from lxml import etree

# The XML document as a str
books = '''
<?xml version="1.0" encoding="utf-8"?>
<bookstore> 
  <book category="cooking"> 
    <title lang="en">Everyday Italian</title>  
    <author>Giada De Laurentiis</author>  
    <year>2005</year>  
    <price>30.00</price> 
  </book>  
  <book category="children"> 
    <title lang="en">Harry Potter</title>  
    <author>J K. Rowling</author>  
    <year>2005</year>  
    <price>29.99</price> 
  </book>  
  <book category="web" cover="paperback"> 
    <title lang="en">XQuery Kick Start</title>  
    <author>James McGovern</author>  
    <author>Per Bothner</author>  
    <author>Kurt Cagle</author>  
    <author>James Linn</author>  
    <author>Vaidyanathan Nagarajan</author>  
    <year>2003</year>  
    <price>50</price> 
  </book> 
</bookstore>
'''
html = etree.HTML(books)
# Absolute path query (etree.HTML wraps the fragment in html/body)
result = html.xpath('/html/body/bookstore/book/@category')  # ['cooking', 'children', 'web']
# // matches nodes anywhere in the document, regardless of position
print(html.xpath('//book/@category'))  # ['cooking', 'children', 'web']
book_nodes = html.xpath('//book')
# . queries relative to the current node
print('First book, relative query:', book_nodes[0].xpath('./year/text()'))  # ['2005']
# All attribute values in the document
print(html.xpath('//@*'))
# ['cooking', 'en', 'children', 'en', 'web', 'paperback', 'en']
# Titles of books whose price is divisible by 2
print(html.xpath('//book[price mod 2 = 0]/title/text()'))
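A few more queries against the same books document, as a sketch of common XPath predicates (the expected results in the comments follow from the XML above):

# Books cheaper than 30.00
print(html.xpath('//book[price < 30]/title/text()'))          # ['Harry Potter']
# The last book element
print(html.xpath('//book[last()]/title/text()'))               # ['XQuery Kick Start']
# Books carrying a cover attribute
print(html.xpath('//book[@cover="paperback"]/title/text()'))   # ['XQuery Kick Start']
# All authors of the third book
print(html.xpath('//book[3]/author/text()'))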

XPath example

import requests
from lxml import etree

url1 = 'https://www.neihanba.com/dz/'
url = 'https://www.neihanba.com/dz/list_%d.html'

if __name__ == '__main__':
    fp = open('./duanzi.csv', mode='a', encoding='utf-8')
    for i in range(1, 101):
        # The first page has no index suffix
        if i == 1:
            url_duanzi = url1
        else:
            url_duanzi = url % (i)
        response = requests.get(url_duanzi)
        response.encoding = 'gbk'
        content = response.text
        html = etree.HTML(content)
        result = html.xpath('//ul[@class="piclist longList"]/li')
        for li in result:
            try:
                title = li.xpath('.//h4/a/b/text()')[0]
                content = li.xpath('.//div[@class="f18 mb20"]/text()')[0].strip()
                info = ''.join(li.xpath('.//div[@class="ft"]/span//text()')[1:])
                fp.write('%s\t%s\t%s\n' % (title, content, info))
            except Exception:
                # On failure, skip for now; log and re-crawl these items separately later
                pass
        print('Page %d saved successfully!' % (i))
    fp.close()
    # !!! The requests.get call above still has no exception handling
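As the note above says, the requests.get call itself has no exception handling. One possible sketch of a retry wrapper (the retry count and timeout are arbitrary choices):

import time

import requests

def fetch(url, retries=3, timeout=10):
    # Fetch a page, retrying a few times on network errors
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print('Attempt %d on %s failed: %s' % (attempt, url, e))
            time.sleep(1)
    return None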