Case Studies
Scraping Doutula
XPath rule of thumb: when two tags are not directly nested, use //; when one is a direct child of the other, use /. Also, @ selects an attribute, such as src or data-original.
When saving data, when should you use response.text and when response.content?
Use response.text for text data; for images or other binary files, use response.content.
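A minimal sketch of the rule (the URLs here are placeholders, not from the case below):

import requests

resp = requests.get('https://example.com')
with open('page.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)       # decoded str: HTML, JSON, plain text

img = requests.get('https://example.com/logo.png')
with open('logo.png', 'wb') as f:
    f.write(img.content)     # raw bytes: images, files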
# The title
"""
//div[@class = "col-sm-9 center-wrap"]//a/div[@class="random_title"]/text()
"""
# All the <a> tags
"""
//div[@class = "col-sm-9 center-wrap"]//a
"""
# The images
"""
//div[@class = "col-sm-9 center-wrap"]//a//div[@class="random_article"]//img/@src
"""
import requests
from lxml import etree  # needed for parsing
import os


class DTSpider():
    def __init__(self):
        self.url = 'https://www.doutula.com/article/list/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }

    # Send the request
    def send_request(self, url):
        response = requests.get(url=url, headers=self.headers)
        return response

    def parse_content(self, response):
        html = response.text
        content = etree.HTML(html)  # parse the HTML
        a_list = content.xpath('//div[@class = "col-sm-9 center-wrap"]//a')
        print(a_list)  # a list of elements
        for a in a_list:
            title_list = a.xpath('./div[@class="random_title"]/text()')
            # . continues selecting relative to the element from the previous XPath
            img_list = a.xpath('.//div[@class="random_article"]//img/@data-original')
            # Save: create a folder named after the title
            if title_list:
                if not os.path.exists('doutu/' + title_list[0]):
                    # title_list is a list holding a single value
                    os.mkdir('doutu/' + title_list[0])
                # Check first and create the folder only if it is missing.
                # Note: the doutu folder itself must be created by hand, or mkdir fails.
                # Saving like the block below works, but the images come out unordered.
                # Some meme sets are sequential, so download them in order instead.
                # for pic in img_list:
                #     response = self.send_request(pic)  # request the image
                #     name = pic[-13:]
                #     self.save_content(response, name, 'doutu/' + title_list[0])
                # enumerate assigns each image an index
                for index, pic in enumerate(img_list):
                    print(index, pic)
                    response = self.send_request(pic)  # request the image
                    name = str(index + 1) + '_' + pic[-13:]
                    # the first image would be index 0, which looks odd, so add 1
                    print(response)
                    self.save_content(response, name, 'doutu/' + title_list[0])

    def save_content(self, response, name, path):
        with open(path + '/' + name, 'wb') as f:
            f.write(response.content)

    def start(self):
        for i in range(1, 2):
            full_url = self.url + str(i)
            response = self.send_request(full_url)
            self.parse_content(response)


if __name__ == '__main__':
    dt = DTSpider()
    dt.start()
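A hedged aside: os.makedirs can replace both the manual creation of the doutu folder and the exists check, because it builds the intermediate directories itself (title_list here is the variable from parse_content above):

import os

folder = 'doutu/' + title_list[0]
os.makedirs(folder, exist_ok=True)  # also creates doutu/ if it is missing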
Scraping Lianjia
The data scraped in this case is saved to a database, so create the database and table first:
create database db_lianjia charset utf8;
create table if not exists lianjia(
agentInfoList varchar(255),
title varchar(255),
houseInfo varchar(255),
dealDate varchar(255),
totalPrice varchar(255),
dealCycleTxt varchar(255),
positionInfo varchar(255),
unitPrice varchar(255)
)charset utf8;
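If you prefer to run the DDL from Python rather than the mysql shell, a minimal sketch with pymysql (same credentials as the spider below):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123456')
with conn.cursor() as cur:
    cur.execute("create database if not exists db_lianjia charset utf8")
    cur.execute("use db_lianjia")
    # ...then execute the create table statement above...
conn.commit()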
import requests
from lxml import etree
import pymysql
import json


class LJSpider():
    def __init__(self):
        self.url = 'https://bj.lianjia.com/chengjiao/pg%d'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        # Connect to the database
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='db_lianjia',
                                          )
        self.cursor = self.connection.cursor()
        self.pn = 1

    def send_request(self, url):
        response = requests.get(url=url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = response.text
        content = etree.HTML(html)  # parse the HTML
        with open('lianjia.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        li_list = content.xpath('//ul[@class="listContent"]/li')  # list of <li> elements
        for li in li_list:
            img_list = li.xpath('./a/img/@src')
            if img_list:
                img = img_list[0]
            else:
                img = ""
            title = "".join(li.xpath('.//div[@class="title"]/a/text()'))
            houseInfo = li.xpath('.//div[@class="houseInfo"]/text()')[0]
            dealDate = li.xpath('.//div[@class="dealDate"]/text()')[0]
            totalPrice = "".join(li.xpath('.//div[@class="totalPrice"]//text()'))
            positionInfo = li.xpath('.//div[@class="positionInfo"]/text()')[0]
            unitPrice = li.xpath('.//div[@class="unitPrice"]/text()')[0]
            dealHouseTxt = "".join(li.xpath('.//div[@class="unitPrice"]//text()'))
            # join here as well: xpath returns a list, and pymysql cannot insert a list
            dealCycleTxt = "".join(li.xpath('.//span[@class = "dealCycleTxt"]/text()'))
            agentInfoList = li.xpath('.//div[@class="agentInfoList"]/a/text()')[0]
            item = {}
            item['agentInfoList'] = agentInfoList
            item['title'] = title
            item['houseInfo'] = houseInfo
            item['dealDate'] = dealDate
            item['totalPrice'] = totalPrice
            item['dealCycleTxt'] = dealCycleTxt
            item['positionInfo'] = positionInfo
            item['unitPrice'] = dealHouseTxt
            # item['dealHouseTxt'] = dealHouseTxt
            self.save_content(item)
        # next = content.xpath('//div[@class="page-box fr"]//a[last()]/@href')
        next_text = content.xpath('//div[@class="page-box fr"]//div/@page-data')[0]
        print(next_text)
        totalPage = json.loads(str(next_text))['totalPage']
        if self.pn < totalPage:
            self.pn += 1
            full_url = self.url % (self.pn)
            response = self.send_request(full_url)
            if response:
                self.parse_content(response)
        # curPage = json.loads(str(next_text))['curPage']
        # if next_text == '下一页':
        #     response = self.send_request(next)
        #     if response:
        #         self.parse_content(response)

    def save_content(self, item):
        # insert into the lianjia table (db_lianjia is the database, not the table)
        sql = "insert into `lianjia` (`agentInfoList`,`title`,`houseInfo`,`dealDate`,`totalPrice`,`dealCycleTxt`,`positionInfo`,`unitPrice`) values (%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, [v for v in item.values()])
        self.connection.commit()

    def start(self):
        full_url = self.url % (self.pn)
        response = self.send_request(full_url)
        if response:
            self.parse_content(response)


if __name__ == '__main__':
    lj = LJSpider()
    lj.start()
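The tail call in parse_content pages through the site by recursing, adding one stack frame per page. A sketch of an iterative loop instead, assuming parse_content is changed to return totalPage rather than recursing:

def start(self):
    while True:
        full_url = self.url % (self.pn)
        response = self.send_request(full_url)
        if not response:
            break
        totalPage = self.parse_content(response)  # assumed to return totalPage
        if self.pn >= totalPage:
            break
        self.pn += 1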
Extracting text with XPath
A small tip for extracting text with XPath:
totalPrice = "".join(li.xpath('.//div[@class="totalPrice"]//text()'))
# this yields the complete value
# whereas extracting like this:
totalPrice = li.xpath('.//div[@class="totalPrice"]//text()')[0]
# only yields the number, without the unit that follows it
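A minimal demo of the difference, assuming the price markup is a <span> with trailing text, as on the Lianjia listing:

from lxml import etree

li = etree.HTML('<div class="totalPrice"><span>580</span>万</div>')
print(li.xpath('//div[@class="totalPrice"]//text()')[0])        # 580
print("".join(li.xpath('//div[@class="totalPrice"]//text()')))  # 580万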
Note: to pull values out of a JSON string such as the page-data attribute above, use json.loads(), which parses the JSON string into a Python object (e.g. a dict); json.dumps() is the reverse, serializing a Python object into a JSON string.
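For example, with a page-data string shaped like the Lianjia pager's (the values here are made up):

import json

next_text = '{"totalPage": 100, "curPage": 1}'
page_info = json.loads(next_text)                 # JSON string -> dict
print(page_info['totalPage'])                     # 100
print(json.dumps(page_info, ensure_ascii=False))  # dict -> JSON string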
Saving Lianjia data to CSV (an alternative version)
import requests
from lxml import etree
import urllib.parse
import json


class LianJiaSpider():
    def __init__(self):
        self.base_url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.page = 0

    def send_request(self, full_url):
        print(full_url)
        response = requests.get(url=full_url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = etree.HTML(response.text)
        with open('lianjia.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        page_data = html.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
        li_list = html.xpath('//ul[@class="sellListContent"]/li')
        info = []
        for li in li_list:
            ctx = {}
            title = li.xpath('.//div[@class="title"]/a/text()')
            if title:
                title = title[0]
                print(title)
                ctx['title'] = title
            price = li.xpath('.//div[@class="totalPrice"]/span/text()')
            if price:
                price = price[0]
                print(price)
                ctx['price'] = price
            info.append(ctx)
        self.save_content(info)
        if self.page == 1:
            self.totalPage = json.loads(page_data).get('totalPage')
        # if page_data == '下一页':
        #     href = html.xpath('//div[@class="page-box house-lst-page-box"]/a[last()]/@href')
        #     next_url = urllib.parse.urljoin(self.base_url, href)
        #     print(next_url)
        #     # self.send_request()

    def save_content(self, info):
        for i in info:
            if not (i.get('title') and i.get('price')):
                continue  # skip <li> entries (e.g. ads) without title or price
            with open('info.csv', 'a', encoding='utf-8') as f:
                f.write(i.get('title') + '\t' + i.get('price') + '万' + '\n')

    def start(self):
        while True:
            self.page += 1
            full_url = self.base_url.format(self.page)
            response = self.send_request(full_url)
            if not response:
                break
            self.parse_content(response)
            if self.page == self.totalPage:
                break
        # for i in range(100, 101):
        #     full_url = self.base_url.format(i)
        #     print(full_url)
        #     response = self.send_request(full_url)
        #     self.parse_content(response)


if __name__ == '__main__':
    ljs = LianJiaSpider()
    ljs.start()
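A hedged alternative for save_content using the standard csv module, which handles quoting and odd characters in titles automatically (same info structure as above):

import csv

def save_content(self, info):
    with open('info.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        for i in info:
            if i.get('title') and i.get('price'):
                writer.writerow([i.get('title'), i.get('price') + '万'])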
Scraping a vegetable price site (filtering by a given date range): this one uses a POST request, while the cases above all use GET.
import requests
import json
import time


class PriceSpider():
    def __init__(self):
        self.url = 'http://www.cncyms.cn/pages.php'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.form_data = {
            'pageNum': 0,
            'pname': '',
            'reltime': '蔬菜',  # form value the site expects ("vegetables")
        }

    def get_time(self, date):
        # Convert "YYYY-MM-DD" to a struct_time, then to a Unix timestamp
        timeArray = time.strptime(date, "%Y-%m-%d")
        timeStamp = int(time.mktime(timeArray))
        return timeStamp  # e.g. 1381419600

    def send_request(self):
        response = requests.post(self.url, data=self.form_data, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        json_content = response.json()  # parsed into a dict
        # print(json_content)
        data_list = []
        for data in json_content.get('list'):
            print(data)
            releasetime = self.get_time(data.get('ReleaseTime'))
            if self.start_time != '1' and self.end_time != '1':
                # keep only records inside the requested date range
                if self.start_time <= releasetime <= self.end_time:
                    print(data)
                    data_list.append(data)
            else:
                data_list.append(data)  # no date filter: keep everything
        content = json.dumps(data_list, ensure_ascii=False)  # keep Chinese readable instead of \uXXXX escapes
        self.save_content(content)
        # content = json.dumps(json_content, ensure_ascii=False)  # or dump the whole page
        # self.save_content(content)

    def save_content(self, content):
        with open('price.txt', 'a', encoding='utf8') as f:
            f.write(content + '\n')

    def start(self):
        try:
            num = int(input('How many pages to scrape? '))
            name = input('Product name (enter 1 for all): ')
            self.start_time = input('Start date, e.g. 2020-06-02 (enter 1 for all): ')
            self.end_time = input('End date, e.g. 2020-06-05 (enter 1 for all): ')
            if self.start_time != '1' and self.end_time != '1':
                self.start_time = self.get_time(self.start_time)
                self.end_time = self.get_time(self.end_time)
                if self.start_time > self.end_time:
                    print('The end date must be later than the start date')
                    return
            if not name == '1':
                self.form_data['pname'] = name
            for i in range(0, num):
                self.form_data['pageNum'] = i
                response = self.send_request()
                self.parse_content(response)
        except Exception as e:
            print('Invalid input:', e)


if __name__ == '__main__':
    ps = PriceSpider()
    ps.start()
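A quick sanity check of the conversion in get_time() (the printed value depends on your local timezone, since mktime interprets the struct_time as local time):

import time

ts = int(time.mktime(time.strptime('2020-06-02', '%Y-%m-%d')))
print(ts)  # Unix timestamp for 2020-06-02 00:00 local time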
Scraping Xiachufang
import requests
from lxml import etree
import urllib.parse


class XiaChuFangSpider():
    def __init__(self):
        self.url = 'https://www.xiachufang.com/category/40076/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }

    def send_request(self, full_url):
        response = requests.get(full_url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = etree.HTML(response.text)
        href_list = html.xpath('//p[@class="name"]/a/@href')
        for href in href_list:
            # the hrefs are relative, so join them with the base URL
            detail_url = urllib.parse.urljoin(self.url, href)
            print(detail_url)
            response = self.send_request(detail_url)
            self.parse_detail(response)

    def parse_detail(self, response):
        html = etree.HTML(response.text)
        with open('xiachufang.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        # strip surrounding whitespace so the title is usable as a filename
        title = html.xpath('//h1/text()')[0].strip()
        tr_list = html.xpath('//div[@class="ings"]//tr')
        print(tr_list)
        zuoliao = ''  # the ingredient list
        for tr in tr_list:
            zuoliao += "\t".join(tr.xpath('./td//text()')).strip().replace('\n', '').replace(' ', '')
        print(zuoliao)
        step = "".join(html.xpath('//li[@class="container"]//p//text()'))
        self.save_content(title, zuoliao, step)

    def save_content(self, title, zuoliao, step):
        # note: the xiachufang folder must be created by hand first
        with open("xiachufang/" + title + '.txt', 'w', encoding='utf-8') as f:
            f.write(zuoliao + "\n\n")
            f.write(step + "\n\n")

    def start(self):
        for i in range(1, 2):
            full_url = self.url + str(i)
            print(full_url)
            response = self.send_request(full_url)
            self.parse_content(response)


if __name__ == '__main__':
    xcfs = XiaChuFangSpider()
    xcfs.start()
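How urljoin resolves the relative recipe links here (/recipe/12345/ is a made-up path for illustration):

import urllib.parse

base = 'https://www.xiachufang.com/category/40076/?page='
print(urllib.parse.urljoin(base, '/recipe/12345/'))
# -> https://www.xiachufang.com/recipe/12345/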