JS: Working with XML (compatible with IE/FF/Chrome/Safari)

This post walks through a JavaScript snippet that reads an XML file and prints out its nodes and attributes, working in IE, Firefox, Chrome, and Safari, including how to handle nodes that may be empty.

A colleague needed to display XML with JS today, so I put a small demo together. It might come in handy again someday! ^_^

JS reads an XML file and outputs its nodes and attributes (compatible with IE and FF)!

It turned out to be a handy little thing, so I'm sharing it here. The code is as follows:

HTML code

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>JS reading XML, compatible with IE/FF/Chrome/Safari</title>
<style>
li { list-style: none; }
</style>
<script type="text/javascript">
//==============================================================
var orderDoc;

// Load the XML document
function loadXmlFile(xmlFile) {
    var xmlDom = null;
    if (window.ActiveXObject) {
        // IE: use the MSXML ActiveX object
        xmlDom = new ActiveXObject("Microsoft.XMLDOM");
        xmlDom.async = false;
        // load() takes a file path/URL; loadXML() takes an XML string
        xmlDom.load(xmlFile) || xmlDom.loadXML(xmlFile);
    } else if (document.implementation && document.implementation.createDocument) {
        // Firefox/Chrome/Safari: fetch the file synchronously via XMLHttpRequest
        var xmlhttp = new window.XMLHttpRequest();
        xmlhttp.open("GET", xmlFile, false);
        xmlhttp.send(null);
        xmlDom = xmlhttp.responseXML;
    } else {
        xmlDom = null;
    }
    return xmlDom;
}

// Return a node's text content, or "" if the node has no children
function isnull(obj) {
    var nodevalue = "";
    if (obj.childNodes[0] != null) {
        nodevalue = obj.childNodes[0].nodeValue;
    }
    return nodevalue;
}

var stringsss = "";

// Build one list item from the record at the given index
function getDataByid(number) {
    var time   = isnull(orderDoc.getElementsByTagName("time")[number]);
    var place  = isnull(orderDoc.getElementsByTagName("place")[number]);
    var reason = isnull(orderDoc.getElementsByTagName("reason")[number]);
    var unit   = isnull(orderDoc.getElementsByTagName("unit")[number]);
    var money  = isnull(orderDoc.getElementsByTagName("money")[number]);
    var status = isnull(orderDoc.getElementsByTagName("status")[number]);
    var docNum = isnull(orderDoc.getElementsByTagName("docNum")[number]);
    var score  = isnull(orderDoc.getElementsByTagName("score")[number]);
    var std = '<li><b>Time: ' + time + '<br />Place: ' + place + '<br />Reason: ' + reason + '</b></li>';
    return std;
}

// Build and write the page content
function getContent() {
    orderDoc = loadXmlFile("http://www.ok22.org/download/ex2.xml");
    var items = orderDoc.getElementsByTagName("results").length;
    var i;
    stringsss += '<ul>';
    for (i = 0; i < items; i++) {
        stringsss += getDataByid(i);
    }
    stringsss += '</ul>';
    document.write(stringsss);
}
</script>

</head>

<body>
<script type="text/javascript">
getContent();
</script>
</body>
</html>
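
For reference, the script assumes the file at ex2.xml contains one <results> element per record, holding the tag names queried in getDataByid. A minimal file with that shape might look like the following (the root element name and field values here are made up for illustration):

XML code

<?xml version="1.0" encoding="gb2312"?>
<root>
  <results>
    <time>2011-05-20</time>
    <place>Beijing</place>
    <reason>Sample record</reason>
    <unit>HQ</unit>
    <money>100</money>
    <status>ok</status>
    <docNum>001</docNum>
    <score>5</score>
  </results>
  <!-- ...more <results> records... -->
</root>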


Because a node in the XML may be empty, I added a helper that checks whether the child node exists before reading its value!

JS code

// Return a node's text content, or "" if the node has no children
function isnull(obj) {
    var nodevalue = "";
    if (obj.childNodes[0] != null) {
        nodevalue = obj.childNodes[0].nodeValue;
    }
    return nodevalue;
}
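
ActiveXObject only exists in old IE, and synchronous XMLHttpRequest is deprecated in current browsers. As a sketch of the same loader against today's standard APIs (the function names loadXmlFileAsync and parseXmlString are my own, not from the original article):

JS code

// Load an XML document asynchronously and hand it to a callback.
// Works in all current browsers; no ActiveX fallback needed.
function loadXmlFileAsync(xmlFile, callback) {
    var xmlhttp = new XMLHttpRequest();
    xmlhttp.open("GET", xmlFile, true); // true = asynchronous
    xmlhttp.onload = function () {
        callback(xmlhttp.responseXML);
    };
    xmlhttp.send(null);
}

// Parse an XML string directly.
function parseXmlString(xmlString) {
    return new DOMParser().parseFromString(xmlString, "text/xml");
}

// Usage: load the file, then render once it arrives, e.g.
// loadXmlFileAsync("ex2.xml", function (xmlDom) { orderDoc = xmlDom; /* build the list */ });

Because the load is asynchronous, the rendering code moves into the callback instead of running immediately after the call, which also avoids blocking the page while the file downloads.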



Reposted from: http://www.ok22.org/art_detail.aspx?id=137