4. Scrapy framework: crawling a site that returns JSON data (spider source code)

This post walks through crawling a site with Python's Scrapy framework: building the HTTP request headers, sending a POST request to fetch JSON data, parsing that data, and supporting incremental crawling. The example shows how to handle the JSON response, extract key fields such as the business ID, organization name, and creation date, and then issue a follow-up request for the detail images.
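
Before the full spider source, here is a minimal sketch of the core pattern it uses: send a POST request whose body is JSON and walk the JSON response. The sketch relies on scrapy.http.JsonRequest (Scrapy 1.8+), which serializes the payload, sets the Content-Type header, and switches the method to POST for you; the spider name is made up, and the payload is a trimmed version of the one in the full source below.

~~~
# Minimal sketch: POST a JSON payload and walk the JSON response.
# 'json_api_demo' is a made-up spider name; the payload below is a trimmed
# version of the one used in the full spider further down.
import json

import scrapy
from scrapy.http import JsonRequest


class JsonApiSpider(scrapy.Spider):
    name = 'json_api_demo'
    api_url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'

    def start_requests(self):
        payload = {'linesPerPage': '20', 'currentPage': '1', 'tag': 'ALLOW'}
        # JsonRequest serializes the payload into the request body, sets
        # Content-Type to application/json and the method to POST
        yield JsonRequest(url=self.api_url, data=payload, callback=self.parse_list)

    def parse_list(self, response):
        body = json.loads(response.text)  # JSON body -> Python dict
        for record in body.get('data', {}).get('list', []):
            yield {'businessId': record.get('businessId')}
~~~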

~~~

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy.http import Request

from kunnanyuan.spider.spider.common import deal_date, transfrom, get_id
from ..items import XkItem


class XkSdl10822Spider(scrapy.Spider):
    name = 'XK-FJM-0102'
    url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'

    # Request headers: capture them with Postman, the browser devtools or ApiPost
    # and convert them straight to Python (see my earlier post
    # 爬虫骚操作之30秒写爬虫(实用)) -- it only takes a few seconds
    headers = {
        'Origin': 'http://222.76.243.118:8090',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This header is required; without it the parsing used below will not work
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
        'Connection': 'keep-alive',
    }

    # Build the initial requests
    def start_requests(self):
        # No pagination loop here: fetch all records in a single request,
        # mainly to keep later incremental crawling simple
        data = {
            'listSql': '',
            'linesPerPage': "6704",
            'currentPage': "1",
            'deptId': '',
            'searchKeyword': '',
            'tag': 'ALLOW'
        }
        yield scrapy.Request(url=self.url, body=json.dumps(data), method='POST',
                             headers=self.headers, callback=self.parse_list)

    # Split the data into pages of a fixed size (not used here)
    # def parse_page(self, response):
    #     self.parse_list(response)
    #     if self.page == 1:
    #         ...  # rest omitted

    def parse_list(self, response):
        # The response body is JSON; convert it to a Python dict
        tr1 = json.loads(response.text)
        # Treat tr1 as one big dict and pull values out by key
        if tr1.get("message") == "请求成功":  # "request succeeded"
            data = tr1.get('data')  # equivalent to data = tr1['data']; same below
            records = data.get('list')
            # Iterate over every record in the JSON list
            for i in records:
                if i['legalPersonDocNumber'] is not None:
                    identifier = i['legalPersonDocNumber']
                else:
                    identifier = i['naturalPersonDocNumber']
                if i['jgFr'] is not None:
                    organization = i['jgFr']
                else:
                    organization = i['jgZr']
                businessId = i['businessId']
                id = i['id']
                objectType = i['objectType']
                createdAt = deal_date(i['businessCreateDate'].split('000000')[0])
                source_url = ("http://222.76.243.118:8090/page/double_publicity/publicity_detail.html"
                              "?id={}&businessId={}&tag=ALLOW&objectType={}".format(str(id), str(businessId), str(objectType)))
                prPrincipal = i['objectName']
                data = {
                    "businessId": businessId,
                    "id": id,
                    'objectType': objectType,
                    'tag': "ALLOW",
                    'pictureMinHeight': '628',
                    'pictureMinWidth': '1200'
                }
                url = "http://222.76.243.118:8090/publicity/get_publicity_detail_picture"
                yield Request(url, callback=self.parse4, body=json.dumps(data), method='POST', headers=self.headers,
                              meta={"identifier": identifier, "organization": organization, "businessId": businessId,
                                    "createdAt": createdAt, "source_url": source_url, "prPrincipal": prPrincipal})

    # Parse the picture-detail response and yield the item
    def parse4(self, response):
        item = XkItem()
        item['identifier'] = response.meta["identifier"]
        item['organization'] = response.meta["organization"]
        print(item['organization'])
        # item['businessId'] = response.meta["businessId"]
        item['createdAt'] = response.meta["createdAt"]
        item['source_url'] = response.meta['source_url']
        item['prPrincipal'] = response.meta['prPrincipal']
        item['type'] = transfrom(str(item['organization']))
        item['fileType'] = "jpg"
        item['pid'] = get_id(str(item['identifier']))
        item['idMethod'] = '2'
        tr2 = json.loads(response.text)
        if tr2.get("message") == "请求成功":
            data = tr2.get('data')
            path = data.get('path')
            item['images'] = "http://222.76.243.118:8090/" + path
        yield item

~~~
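
The helpers deal_date, transfrom and get_id are imported from the project's own common module, which this post does not show. A rough, purely hypothetical sketch of what they might look like, just so the example above is self-contained:

~~~
# Hypothetical stand-ins for the project-specific helpers imported in the
# spider above (kunnanyuan.spider.spider.common is not shown in the post).
import hashlib


def deal_date(raw):
    """Normalize a compact date string such as '20190506' to '2019-05-06' (assumed format)."""
    raw = raw.strip()
    if len(raw) >= 8:
        return '{}-{}-{}'.format(raw[0:4], raw[4:6], raw[6:8])
    return raw


def transfrom(organization):
    """Map an organization name to a coarse record type (assumed mapping)."""
    return '企业' if organization else '个人'


def get_id(identifier):
    """Derive a stable primary key from the document number (assumed scheme)."""
    return hashlib.md5(identifier.encode('utf-8')).hexdigest()
~~~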

Alternatively:

~~~

# coding=utf-8
import json

import scrapy


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.test.com/test/get_data"
    ]

    def parse(self, response):
        # body_as_unicode() handles unicode-encoded response bodies;
        # on recent Scrapy versions, response.text is the preferred equivalent
        sites = json.loads(response.body_as_unicode())
        # print(sites['k'])
        numbers = sites['k'].split(',')
        print(numbers)

~~~
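
On Scrapy 2.2 or newer, TextResponse also exposes a json() shortcut, so the parse method above can skip the explicit json.loads call; a small sketch under that assumption:

~~~
# The same parse step on Scrapy 2.2+, where TextResponse provides json():
def parse(self, response):
    sites = response.json()           # parses the JSON body into Python objects
    numbers = sites['k'].split(',')   # 'k' is the field used in the example above
    self.logger.info(numbers)         # prefer the spider's logger over print()
~~~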
