- 解析内容
得到的内容可能是 HTML,可以用正则表达式或页面解析库进行解析;可能是 JSON,可以直接转换为 JSON 对象解析;也可能是二进制数据,可以保存或做进一步处理。
- 正则表达式处理
def get_zhushi_info(url):
    """Fetch and return the plain-text annotation (zhushi) for a poem.

    The url is gushiwen's ajax endpoint for annotation content, e.g.
    'https://so.gushiwen.org/shiwen2017/ajaxshiwencont.aspx?id=...&value=zhu'.
    The response is an HTML fragment; each annotation paragraph sits in a
    <p>...<br /> span, whose markup is stripped via XPath string(.).

    :param url: full ajax URL including the id/value query parameters
    :return: all annotation paragraphs concatenated into one string
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
        "referer": "https://www.gushiwen.org/",
        # Mark the request as ajax; the endpoint is an XMLHttpRequest target.
        'x-requested-with': 'XMLHttpRequest',
    }
    # Issue the ajax request and decode the HTML fragment.
    response = requests.get(url, headers=headers).content.decode('utf-8')
    # Capture the inner HTML of each <p> up to its first <br />.
    p_br = re.findall('<p>(.*?)<br />', response)
    # Parse each fragment and flatten it to text (string(.) drops all tags).
    return ''.join(etree.HTML(item).xpath('string(.)') for item in p_br)
- BeautifulSoup解析处理(BeautifulSoup就是一个专门用来从html和xml文档中提取数据的库)
- XPath解析处理
# Fetch the listing page (custom User-Agent set via `header`, defined elsewhere).
req = urllib.request.Request(url)
req.add_header('User-Agent', header)
response_result = urllib.request.urlopen(req).read()
html = etree.HTML(response_result.decode('utf-8'))
# Author name/link pairs live under main3 > right > sons > cont.
author_href = html.xpath(
    '//div[@class="main3"]/div[@class="right"]/div[@class="sons"]/div[@class="cont"]/a/@href')
authors = html.xpath(
    '//div[@class="main3"]/div[@class="right"]/div[@class="sons"]/div[@class="cont"]/a/text()')
for author, href in zip(authors, author_href):
    # href embeds the author id between '_' and '.aspx' — extract it.
    author_id = href.split('_')[1].split('.aspx')[0]
    # One file per author, containing that author's works-listing URL.
    # write(), not writelines(): we emit a single string, not a sequence of lines.
    with open(cipai_path + author + '.txt', 'w', encoding='utf-8') as wf:
        wf.write('https://so.gushiwen.org/authors/authorvsw_' + author_id + 'A1.aspx')