前嗅ForeSpider Script Tutorial - Link Extraction: Writing a Script for Links in a POST Request

This article is a hands-on tutorial on link extraction with 前嗅ForeSpider, covering the case where the links are delivered in a POST request. It shows how to locate the link data with the browser's developer tools, and gives a concrete script for looping over a JSON array to extract the links.


Today's tutorial covers link extraction in 前嗅ForeSpider: how to write a script when the links live in a POST request. The details are as follows:

When a link address does not exist in the page source and only appears in a POST request's response, use the browser's developer tools to locate the link data.

1. Links that need a loop

Scenario: a group of links sits in an array inside a JSON response.

Example: collecting the movie links on a Douban Movies list page.

Viewing the page source shows that it contains only part of the data; the additional movies that appear as you scroll down are not in the source, so the data has to be found in a request's response body instead. Right-click the page and choose "Inspect Element" (审查元素), scroll down and click "Load More" (加载更多); a request packet appears in the network panel. Checking its Response shows that the link addresses are in the returned body.

Paste the Response body into a JSON viewer (for example the Notepad++ plugin "JSON Viewer") and locate the path at which the data sits.
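For orientation, the response from this Douban endpoint is shaped roughly as below. The script that follows relies only on the subjects array and the url field of each element; the other fields and the concrete values here are illustrative placeholders, not taken from a real response:

{
  "subjects": [
    {"title": "Movie A", "url": "https://movie.douban.com/subject/1111111/", "rate": "8.0"},
    {"title": "Movie B", "url": "https://movie.douban.com/subject/2222222/", "rate": "7.5"}
  ]
}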

Script example:

var urs = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=60";
var groc = EXTRACT.OpenDoc(CHANN,urs,0);              // open the endpoint behind "Load More"
if(groc){
    var jsonStr = groc.GetDom().GetSource().ToStr();  // raw response body as a string
    jScript js;
    var json = js.RunJson(jsonStr);                   // parse the body as JSON
    var arr = json.subjects;                          // the array located in the JSON viewer
    for(int i=0;i<arr.size;i++)
    {
        url u;                                        // build one link object per array element
        u.title="";
        u.urlname=arr[i].url;                         // the movie's link address
        u.entryid=CHANN.id;
        u.tmplid=2;                                   // template id that will process the link
        RESULT.AddLink(u);                            // hand the link to the crawler
    }
}
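The URL above fetches a single page (page_limit=20, page_start=60). To walk more of the list you can vary page_start in an outer loop. The following is a minimal sketch reusing only the calls shown above; the five-page count is an arbitrary choice, and C-style concatenation of a string with a number is an assumption about the script language, not something the original demonstrates:

for(int p=0;p<5;p++)                                  // 5 pages, arbitrary for illustration
{
    var urs = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=" + p*20;  // assumed string+number concatenation
    var groc = EXTRACT.OpenDoc(CHANN,urs,0);
    if(groc){
        var jsonStr = groc.GetDom().GetSource().ToStr();
        jScript js;
        var json = js.RunJson(jsonStr);
        var arr = json.subjects;
        for(int i=0;i<arr.size;i++)
        {
            url u;
            u.title="";
            u.urlname=arr[i].url;
            u.entryid=CHANN.id;
            u.tmplid=2;
            RESULT.AddLink(u);
        }
    }
}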

2. No loop

Scenario: only a few links, each stored as the value of a single key in the JSON.

Example: none given in the original; the pattern follows directly from the looped case, as in the sketch below.
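A minimal sketch, under the assumption that the response carries a single link under a key named detail_url (both the endpoint and the key name are hypothetical), reusing only the calls already shown in the script above:

var urs = "https://example.com/api/detail";           // hypothetical endpoint
var groc = EXTRACT.OpenDoc(CHANN,urs,0);
if(groc){
    var jsonStr = groc.GetDom().GetSource().ToStr();  // raw response body
    jScript js;
    var json = js.RunJson(jsonStr);                   // parse as JSON
    url u;
    u.title="";
    u.urlname=json.detail_url;                        // hypothetical key holding the one link
    u.entryid=CHANN.id;
    u.tmplid=2;
    RESULT.AddLink(u);                                // no loop: a single link is emitted
}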

Reposted from: https://my.oschina.net/forespider/blog/3035450
