I. Introduction

- Have you also faced the choice of parameters for FormRequest in the Scrapy framework: formdata or body?
- When you run into pages like the ones below, how should you choose which parameter to pass the data through? Compare the parts highlighted in the red boxes.
- Here are two examples.
II. formdata and body in Scrapy's FormRequest
The difference between the two is small; the main thing is to practice with both. Details below:
1. formdata
In the Scrapy source code:
class FormRequest(Request):
    valid_form_methods = ['GET', 'POST']

    def __init__(self, *args, **kwargs):
        formdata = kwargs.pop('formdata', None)
        if formdata and kwargs.get('method') is None:
            kwargs['method'] = 'POST'

        super(FormRequest, self).__init__(*args, **kwargs)

        if formdata:
            items = formdata.items() if isinstance(formdata, dict) else formdata
            querystr = _urlencode(items, self.encoding)
            if self.method == 'POST':
                self.headers.setdefault(b'Content-Type', b'application/x-www-form-urlencoded')
                self._set_body(querystr)
            else:
                self._set_url(self.url + ('&' if '?' in self.url else '?') + querystr)
- Note this line: items = formdata.items() if isinstance(formdata, dict) else formdata. The formdata it expects is a dict.
- So now we know: when using formdata, pass in a dict.
- One thing to watch when building that dict: its values cannot be int; for example 3 must be converted to "3".
- Here's an example (the fake_headers/fake_data helpers are sketched right after it):
yield scrapy.FormRequest(
    url="http://example.com",
    headers=self.fake_headers(jsessionid),
    callback=self.handle_response,
    formdata=self.fake_data({
        "page": "123",
        "TimeStr": "2000-01-01,2021-01-30",
        "allDq": "reset2",
        "allHy": "reset1",
        "AllPtName": "",
        "KeyStr": "",
        "KeyType": "ggname",
    }),
)
2. body
- One thing to note about the body parameter: it must be a str.
- So if the data you need to send is a dict, convert it to a str with json's dumps() method (remember to import json).
- Here's an example (a note on the Content-Type header follows the snippet):
yield scrapy.FormRequest(
    url="http://example.com",
    callback=self.handle_response,
    dont_filter=True,
    body=json.dumps({
        "accuracy": "",
        "cl": 200,
        "cnum": "001",
    }),
    meta={'page_pn': pn},
)
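One design note: the application/x-www-form-urlencoded default in the source above is only applied on the formdata branch, so when you set body yourself FormRequest does not pick a Content-Type for you. For a JSON body you normally add the header explicitly yourself; a sketch (example.com and the payload are placeholders):

yield scrapy.FormRequest(
    url="http://example.com",
    method="POST",  # without formdata, the method defaults to GET, so state it explicitly
    headers={"Content-Type": "application/json;charset=UTF-8"},
    body=json.dumps({"cl": 200, "cnum": "001"}),
    callback=self.handle_response,
    dont_filter=True,
)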
III. Give It a Try
- Take 10 minutes and try to request the two examples from the introduction yourself, then come back for the walkthrough below.
IV. Reference Solutions
1. Case 1
Hebei Public Resources Trading Platform: http://ggzy.hebei.gov.cn/hbggfwpt
Detail page: http://ggzy.hebei.gov.cn/hbggfwpt/jydt/salesPlat.html
# Put the following code into your Scrapy spider class (the cookie has probably expired; grab a fresh one from the site):
import scrapy


class XXXXSpider(scrapy.Spider):
    name = "XXX"

    def fake_headers(self):
        return {
            "Host": "ggzy.hebei.gov.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            # "Content-Length": "455",
            "Origin": "http://ggzy.hebei.gov.cn",
            "Connection": "keep-alive",
            "Referer": "http://ggzy.hebei.gov.cn/hbggfwpt/jydt/salesPlat.html",
            "Cookie": "JSESSIONID=FDADCC976DFAD9C70E024249EE5A6886; oauthClientId=demoClient; oauthPath=http://127.0.0.1:8000/EpointWebBuilderZw; oauthLoginUrl=http://127.0.0.1:8000/EpointWebBuilderZw/rest/oauth2/authorize?client_id=demoClient&state=a&response_type=code&scope=user&redirect_uri=; oauthLogoutUrl=http://127.0.0.1:8000/EpointWebBuilderZw/rest/oauth2/logout?redirect_uri=; noOauthRefreshToken=719d81f71e7b2a7e7a67d08e1757f061; noOauthAccessToken=fd0ea75de4db1067e97d6870ed65303e",
        }

    def start_requests(self):
        print("gogogo")
        headers = self.fake_headers()
        yield scrapy.FormRequest(
            url="http://ggzy.hebei.gov.cn/inteligentsearch/rest/inteligentSearch/getFullTextData",
            method="POST",
            headers=headers,
            # The payload is sent as a raw string via body; pn is the record offset used for paging.
            body='{"token":"","pn":310,"rn":10,"sdt":"","edt":"","wd":" ","inc_wd":"","exc_wd":"","fields":"title",'
                 '"cnum":"001","sort":"{\\"webdate\\":0}","ssort":"title","cl":200,"terminal":"",'
                 '"condition":[{"fieldName":"categorynum","equal":"003001","notEqual":null,"equalList":null,'
                 '"notEqualList":null,"isLike":true,"likeType":2}],"time":null,"highlights":"title","statistics":null,'
                 '"unionCondition":null,"accuracy":"","noParticiple":"0","searchRange":null,"isBusiness":"1"}',
            callback=self.handle_response,
        )

    def handle_response(self, response):
        print(response.text)
        print(response.status)
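The endpoint returns JSON, so in practice you would parse the response rather than only print it. A minimal sketch of a handle_response variant (the payload structure is not documented here, so the code only inspects the top-level keys; put import json at the top of the spider module):

import json

def handle_response(self, response):
    # Parse the JSON body; inspect the top-level keys before writing extraction logic.
    data = json.loads(response.text)
    print(response.status)
    print(list(data.keys()) if isinstance(data, dict) else type(data))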
2. Case 2
Zhejiang Public Resources Trading Platform: http://www.zjpubservice.com/
Detail page: http://www.zjpubservice.com/jyxxgk/list.html
def get_payloaddata(self, start_num, categoryname, city_code):
    # Build the JSON payload as a dict; json.dumps() turns it into the str that body requires.
    return {
        "token": "",
        "pn": start_num,  # record offset used for paging
        "rn": 12,
        "sdt": "",
        "edt": "",
        "wd": "null",
        "inc_wd": "",
        "exc_wd": "",
        "fields": "title",
        "cnum": "001",
        "sort": "{\"webdate\": \"0\"}",
        "ssort": "title",
        "cl": 200,
        "terminal": "",
        "condition": [{
            'fieldName': "categorynum",
            'isLike': 'true',
            'likeType': 2,
            'equal': f"{categoryname}",
        }, {
            'fieldName': "infoc",
            'isLike': 'true',
            'likeType': 2,
            'equal': f"{city_code}",
        }],
        "time": [{
            'fieldName': "webdate",
            'startTime': "2000-11-11 00:00:00",
            'endTime': f"{get_current_date()} 23:59:59",  # get_current_date() is a helper (not shown) returning today's date string
        }],
        "highlights": "",
        "statistics": "null",
        "unionCondition": "null",
        "accuracy": "",
        "noParticiple": "0",
        "searchRange": "null",
        "isBusiness": "1",
    }
def start_requests(self):
    print("gogogo")
    # Fill these in with the values captured from the browser request (placeholders here).
    start_num, categoryname, city_code = 0, "", ""
    yield scrapy.FormRequest(
        url="http://www.zjpubservice.com/inteligentsearch/rest/inteligentSearch/getFullTextData",
        headers={
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
            "Content-Type": "application/json;charset=UTF-8",
            # Grab the cookie from the site yourself; it expires quickly, so it is not pasted in full here.
            "Cookie": f"HttpOnly; JSESSIONID=A2705C89D6356F3EC1B904A3FC5CAFA6; HttpOnly; oauthClientId=demoClient; "
                      f"oauthPath=http://223.4.69.84:8080/EpointWebBuilder; oauthLoginUrl=http://127.0.0.1/"
                      f"membercenter/login.html?redirect_uri=; oauthLogoutUrl=; noOauthRefreshToken="
                      f"1dc347b4d59250e0d344fa1a896e6808; noOauthAccessToken=bf3606f81125ad67d3fa1132ffc360dd; {SERVERID}",
            # "Host": "www.zjpubservice.com",
            # "Origin": "http://www.zjpubservice.com",
            "Proxy-Connection": "keep-alive",
            # "Referer": "http://www.zjpubservice.com/jyxxgk/list.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.58",
            "X-Requested-With": "XMLHttpRequest",
        },
        callback=self.handle_response,
        body=json.dumps(self.get_payloaddata(start_num, categoryname, city_code)),
        dont_filter=True,
    )

def handle_response(self, response):
    print(response.text)
    print(response.status)
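Since pn is the record offset and rn the page size, crawling several pages is just a loop over offsets. A minimal sketch, assuming pn steps by rn in the way the pn=310 / rn=10 combination in Case 1 suggests (the page count, categoryname, and city_code values are placeholders):

def start_requests(self):
    categoryname, city_code = "", ""   # copy the real codes from the captured browser request
    for page in range(10):             # placeholder number of pages
        start_num = page * 12          # rn=12 records per page in the payload above
        yield scrapy.FormRequest(
            url="http://www.zjpubservice.com/inteligentsearch/rest/inteligentSearch/getFullTextData",
            headers={"Content-Type": "application/json;charset=UTF-8"},  # plus the cookie and other headers shown above
            body=json.dumps(self.get_payloaddata(start_num, categoryname, city_code)),
            callback=self.handle_response,
            dont_filter=True,
            meta={'page_pn': start_num},
        )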
V. A Handy Trick
Use Fiddler Everywhere to inspect how the requests differ.
1. First open the page normally in the browser
- 1) Capture the packets for the site and save them; below is the [Fiddler Everywhere, page opened normally in the browser] screenshot.
- 2) Switch the tab bar to Body and look specifically at the Form-Data view; below is the [Fiddler Everywhere, page opened normally in the browser] screenshot.
Below is the [QQ Browser devtools, page opened normally in the browser] screenshot.
Comparing the browser screenshot [position A above] with Fiddler Everywhere, you can see that this page [http://ggzy.hebei.gov.cn/inteligentsearch/rest/inteligentSearch/getFullTextData] is a POST request whose parameter panel shows Form Data [the browser view, position B above]. The parameters of this XHR request form a dict M whose only key is a very long JSON-like string N, and whose value is an empty string. If you used formdata (i.e. passed in a dict), that key would be far too unwieldy, so in this case avoid formdata and pass the payload through the body parameter instead. The small sketch below shows why.
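To see why that Form-Data view looks the way it does, recall from the source above that the formdata branch url-encodes its items. A minimal sketch of what FormRequest would send if you literally reproduced the captured view by passing {N: ""} as formdata (the payload string is shortened here):

from urllib.parse import urlencode

payload = '{"token":"","pn":0,"rn":10}'   # stands in for the long JSON string N

# This is what the formdata branch would produce for {N: ""}:
print(urlencode({payload: ""}))
# %7B%22token%22%3A%22%22%2C%22pn%22%3A0%2C%22rn%22%3A10%7D=
# The whole JSON string gets percent-encoded into a single key with an empty value,
# which is not the raw JSON body the server actually expects.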
2. Reproduce it in Python
- At this point the goal is clear: in Scrapy, send a POST request using the body parameter, with the url "http://ggzy.hebei.gov.cn/inteligentsearch/rest/inteligentSearch/getFullTextData".
yield scrapy.FormRequest(
    url="http://ggzy.hebei.gov.cn/inteligentsearch/rest/inteligentSearch/getFullTextData",
    method="POST",  # keep this explicit: without formdata, the method defaults to GET
    body=self.fake_data(0),
)

def fake_data(self, pn):
    body = '{"token":"","pn":' + str(pn) + ',"rn":10,"sdt":"","edt":"","wd":" ","inc_wd":"","exc_wd":"","fields":"title",' \
           '"cnum":"001","sort":"{\"webdate\":0}","ssort":"title","cl":200,"terminal":"",' \
           '"condition":[{"fieldName":"categorynum","equal":"003001002","notEqual":null,"equalList":null,' \
           '"notEqualList":null,"isLike":true,"likeType":2}],"time":null,"highlights":"title","statistics":null,' \
           '"unionCondition":null,"accuracy":"","noParticiple":"0","searchRange":null,"isBusiness":"1"}'
    return body
- But when this runs, the request fails and nothing comes back. Find the request in Fiddler Everywhere and check Body > Form-Data.
- One glance is enough: in the red box in the figure below, two backslashes are missing.
- So fix the code above by escaping the backslashes: write \\" in the Python literal so the string actually sent contains \" around webdate (in the first version, \" inside the single-quoted literal collapses to a plain ", which breaks the JSON).
yield scrapy.FormRequest(
    url="http://ggzy.hebei.gov.cn/inteligentsearch/rest/inteligentSearch/getFullTextData",
    method="POST",  # keep this explicit: without formdata, the method defaults to GET
    body=self.fake_data(0),
)

def fake_data(self, pn):
    body = '{"token":"","pn":' + str(pn) + ',"rn":10,"sdt":"","edt":"","wd":" ","inc_wd":"","exc_wd":"","fields":"title",' \
           '"cnum":"001","sort":"{\\"webdate\\":0}","ssort":"title","cl":200,"terminal":"",' \
           '"condition":[{"fieldName":"categorynum","equal":"003001002","notEqual":null,"equalList":null,' \
           '"notEqualList":null,"isLike":true,"likeType":2}],"time":null,"highlights":"title","statistics":null,' \
           '"unionCondition":null,"accuracy":"","noParticiple":"0","searchRange":null,"isBusiness":"1"}'
    return body
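A design note: hand-escaping a JSON string like this is fragile. A less error-prone variant (a sketch, not the original code) is to build the payload as a Python dict and let json.dumps produce the correctly escaped string, the same approach used in Case 2:

import json

def fake_data(self, pn):
    payload = {
        "token": "", "pn": pn, "rn": 10, "sdt": "", "edt": "", "wd": " ",
        "inc_wd": "", "exc_wd": "", "fields": "title", "cnum": "001",
        "sort": "{\"webdate\":0}",   # json.dumps adds the \" escapes in the output for us
        "ssort": "title", "cl": 200, "terminal": "",
        "condition": [{"fieldName": "categorynum", "equal": "003001002",
                       "notEqual": None, "equalList": None, "notEqualList": None,
                       "isLike": True, "likeType": 2}],
        "time": None, "highlights": "title", "statistics": None,
        "unionCondition": None, "accuracy": "", "noParticiple": "0",
        "searchRange": None, "isBusiness": "1",
    }
    return json.dumps(payload)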
That does the trick. This write-up is a bit rough around the edges; if you have any questions, leave a comment below! If you liked it, follow along and let's scrape together.