单字段测试 xpath+jpath+re+requests+phantomjs

最新推荐文章于 2021-04-19 13:50:50 发布

原创最新推荐文章于 2021-04-19 13:50:50 发布 · 709 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#单字段测试 #xpath #jpath #re

Python 同时被 2 个专栏收录

22 篇文章

订阅专栏

爬虫-从入坑到脱坑

18 篇文章

订阅专栏

本文介绍了一个用于简化XPath、JPath和正则表达式测试的工具类，该工具类支持单字段测试，并提供了方便快捷的方法来处理各种数据抓取需求。

xpath+jpath+re单字段测试

最近测试时发现，XPath、JSONPath 和正则表达式这三种提取方式每次使用都要重复编写大量代码，于是把它们封装成了一个单字段测试类，并提供了对应的接口。以后测试时只需导入该类及其依赖包即可直接使用，简单方便。

import json, re
from jsonpath import jsonpath
import chardet
from lxml import etree

class Spiders(object):
    """Single-field extraction helpers for JSONPath, regex and XPath rules.

    Each public method takes a fetched document plus one rule and returns
    the matches, so a rule can be tried out without writing a full spider.
    """

    @staticmethod
    def _to_text(html):
        """Coerce a fetched document to ``str``; bytes are decoded via chardet."""
        if isinstance(html, str):
            return html
        if isinstance(html, (bytes, bytearray)):
            raw = bytes(html)
            encoding = chardet.detect(raw).get("encoding") or "utf-8"
            try:
                return raw.decode(encoding, "replace")
            except LookupError:
                # chardet reported a codec this interpreter does not know.
                return raw.decode("utf-8", "replace")
        return str(html)

    def jpath(self, html, regex):
        """Apply the JSONPath expression ``regex`` to a JSON document.

        Args:
            html: JSON text (``str``/``bytes``) or an already-parsed
                ``dict``/``list``. Any other object is converted with
                ``str()`` before parsing.
            regex: JSONPath expression passed to ``jsonpath``.

        Returns:
            The list of matches, or ``[]`` when nothing matches or the
            JSONPath evaluation raises.

        Raises:
            ValueError: if ``html`` is text that is not valid JSON.
        """
        body = html
        if not isinstance(body, (dict, list)):
            # str() on bytes or on a dict yields a Python repr, not JSON, so
            # only fall back to str() for objects json.loads cannot take.
            if not isinstance(body, (str, bytes, bytearray)):
                body = str(body)
            body = json.loads(body)
        try:
            result = jsonpath(body, regex)
        except Exception as e:  # best effort: a failing rule yields no matches
            print(e)
            result = []
        # A falsy result (no match) is normalised to an empty list.
        return result or []

    def rpath(self, html, regex):
        """Return every match of the regular expression ``regex`` in ``html``.

        Pure-ASCII documents that contain backslash escapes (for example
        JSON with ``\\uXXXX`` sequences) are unescaped first so the pattern
        can be written with the real characters.

        Args:
            html: Document as ``str`` or ``bytes``.
            regex: Pattern passed to ``re.findall``.

        Returns:
            The list produced by ``re.findall``.
        """
        body = self._to_text(html)
        if "\\" in body:
            try:
                body = body.encode("ascii").decode("unicode-escape")
            except UnicodeError:
                # Non-ASCII text or a malformed escape: search the text as is.
                pass
        return re.findall(regex, body)

    def xpath(self, html, regex):
        """Evaluate one or more ``&``-separated XPath rules against ``html``.

        The rules are tried in order and the first truthy result wins.

        Args:
            html: HTML document handed to ``etree.HTML``.
            regex: XPath expression, or several alternatives joined by ``&``.

        Returns:
            The first truthy XPath result; ``[]`` when no alternative
            matched; ``[""]`` when ``regex`` is empty or parsing/evaluation
            raised.
        """
        if not regex:
            return [""]
        rules = regex.split("&")
        try:
            # Parse once and reuse the tree for every alternative.
            tree = etree.HTML(html)
            for rule in rules:
                result = tree.xpath(rule.strip())
                if result:
                    return result
            return []
        except Exception as e:  # best effort: report and fall back
            print(e)
        return [""]

接口采用 POST 方式传参，并根据页面抓取的难易程度（difficult 参数）选择抓取方式：普通页面（a）使用 requests，特殊页面（b）使用 PhantomJS。

def _fetch_page(url, difficult):
    """Fetch ``url`` and return the page source as text.

    ``difficult == 'a'`` performs a plain ``requests`` GET; any other value
    loads the page in PhantomJS.
    """
    if difficult == 'a':
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
        # A timeout keeps one unresponsive site from blocking the worker forever.
        return requests.get(url, headers=headers, timeout=30).text
    # NOTE(review): newer Selenium releases dropped PhantomJS support --
    # confirm the pinned Selenium version before upgrading.
    browser = webdriver.PhantomJS(executable_path=r"../phantomjs_w/bin/phantomjs.exe")
    try:
        browser.get(url)
        time.sleep(2)  # presumably gives the page time to finish rendering
        return browser.page_source
    finally:
        # Always shut the browser down so PhantomJS processes do not pile up.
        browser.quit()


def spider_xpath(request):
    """Test a single extraction rule against a URL and report the matches.

    The request body is a JSON object with the keys ``url``, ``rule``,
    ``type`` (``xpath``, ``rpath``/``re``, anything else means JSONPath)
    and ``difficult`` (``a`` for a plain HTTP fetch, otherwise PhantomJS).

    Relies on ``requests``, ``webdriver``, ``time``, ``HttpResponse`` and
    ``JSONEncoder`` being available in the module namespace.

    Returns:
        An ``HttpResponse`` carrying a JSON object. On success it holds
        ``page`` (the textual form of the matches), ``message`` and
        ``code == 1``; on any failure ``message`` and ``code == -1``.
    """
    result = {}
    spider = Spiders()
    try:
        data = json.loads(request.body)
        url = data['url']  # page to fetch
        rule = data['rule']  # extraction rule
        ru_type = data['type']  # xpath, rpath/re, or jpath
        difficult = data['difficult']  # normal (a) or special (b)
        res = _fetch_page(url, difficult)
        if ru_type == 'xpath':
            datas = spider.xpath(res, rule)
        elif ru_type in ('rpath', 're'):
            # 're' is accepted as an alias so it no longer falls through
            # to the JSONPath branch.
            datas = spider.rpath(res, rule)
        else:
            datas = spider.jpath(res, rule)
        # The matches are reported as readable text. str() keeps non-ASCII
        # characters intact, whereas the former chardet/unicode-escape round
        # trip raised on Python 3 and turned every success into a failure.
        datas = str(datas)
        print(datas)

        result['page'] = datas
        result['message'] = '测试成功'
        result['code'] = 1
        print('成功')
    except Exception as e:  # endpoint boundary: report failure to the client
        print(e)
        result['message'] = '测试失败'
        result['code'] = -1
    return HttpResponse(json.dumps(result, cls=JSONEncoder), content_type="application/json")