The Final Day at School

作者即将结束学生生涯,前往北京开始工作。在最后一天,他陪伴女友度过,并试图让彼此感到快乐。晚餐时的一番对话揭示了两人内心的真实感受,充满了离别前的不舍与感伤。

Today is the final day of my student life.

Tomorrow I will go to Beijing to start my job.

On this last day, I spent most of my time keeping my girlfriend company.

Today she seemed very happy.

But I knew it wasn't real.

At dinner I asked her:

"Why are you so happy? Is it because I'm leaving?"

and.....?

At that moment, my girlfriend started to cry.

"No. I want to be happy because I want you to be happy," she said.

I felt very, very sad.

"You don't want me to see you off? You want to leave while I'm asleep?" she asked.

I know some things are very sad.

But I didn't know that this is how my life would begin.

爬到的这种数据我要怎么根据他们的特征把他们进行组装,其中我拿到的是一个大列表,下面是我的代码:# !/usr/bin/env python # -*- coding: utf-8 -*- # ---------------------- '''''' ''' # table_head = section.xpath('.//div[@class="SchoolsAccordionTable__DesktopSchoolFactHeaderWrapper-sc-1w3ewmz-0 bedwwe"]//text()') # table_head = list(dict.fromkeys(table_head)) # table_lists = section.xpath('//div[@class="Accordion__Wrapper-sc-310oi8-0 WdYBm"]/div') # for table in table_lists: # h1 = table.xpath('//*[@id="header-school-accordion-item-Mosman-Public-School"]/div[@class="SchoolAccordionItemstyled__TitleWrapper-sc-1p6c5y1-1 caYcSe"]//text()') # h2 = table.xpath('//*[@id="header-school-accordion-item-Blessed-Sacrament-Catholic-Primary-School"]/div[@class="SchoolAccordionItemstyled__TitleWrapper-sc-1p6c5y1-1 caYcSe"]//text()') # h3 = table.xpath('//*[@id="header-school-accordion-item-Queenwood-Junior-School"]/div[@class="SchoolAccordionItemstyled__TitleWrapper-sc-1p6c5y1-1 caYcSe"]//text()') # h4 = table.xpath('//*[@id="header-school-accordion-item-Mosman-Church-of-England-Preparatory-School"]/div[@class="SchoolAccordionItemstyled__TitleWrapper-sc-1p6c5y1-1 caYcSe"]//text()') # h5 = table.xpath('//*[@id="header-school-accordion-item-Queenwood,-Art-and-Design-Campus"]/div[@class="SchoolAccordionItemstyled__TitleWrapper-sc-1p6c5y1-1 caYcSe"]//text()') ''' import requests from lxml import etree def spider(): # 发送请求 start_url = 'https://www.property.com.au/nsw/mosman-2088/alexander-ave/1-pid-27882/' headers = { 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8', 'cache-control': 'no-cache', 'pragma': 'no-cache', 'priority': 'u=0, i', 'referer': 'https://www.property.com.au/nsw/mosman-2088/alexander-ave/', 'upgrade-insecure-requests': '1', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36', 'cookie': 
'pcauid=b1863899-35a3-4e61-8986-84998f585702; split_audience=e; _gcl_au=1.1.975648010.1755478990; _ga=GA1.1.776858578.1755478991; hubspotutk=6d1130599ac0be87140a985bd2f71d9a; __gads=ID=33d262c937dc2ff8:T=1755480673:RT=1755480673:S=ALNI_MY-ft-_2i3hA6QJ77JPXKm93K6dBg; __gpi=UID=00001181eb368bb0:T=1755480673:RT=1755480673:S=ALNI_MbeNX-ZTOUv1i8a1VpUj7cXsEmzTg; __eoi=ID=c7871334faad58bc:T=1755480673:RT=1755480673:S=AA-AfjbYXeZ21klLjEocX7eJIKPC; Country=CN; _sp_ses.6d38=*; AMCVS_341225BE55BBF7E17F000101%40AdobeOrg=1; AMCV_341225BE55BBF7E17F000101%40AdobeOrg=179643557%7CMCIDTS%7C20324%7CMCMID%7C12784190570061788621030718138656103977%7CMCAAMLH-1756606032%7C11%7CMCAAMB-1756606032%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1756008432s%7CNONE%7CMCAID%7CNONE%7CMCSYNCSOP%7C411-20326%7CvVersion%7C5.5.0; s_cc=true; ab.storage.deviceId.b2062aeb-858c-41cc-b98c-ff2aaec84951=%7B%22g%22%3A%22b0b06b00-0972-9819-8c9f-746ab4c79462%22%2C%22c%22%3A1755355392674%2C%22l%22%3A1756001235226%7D; DM_SitId1467=1; DM_SitId1467SecId12715=1; __hstc=80006152.6d1130599ac0be87140a985bd2f71d9a.1755479006040.1755929837815.1756001237383.5; __hssrc=1; legs_sq=%5B%5BB%5D%5D; s_sq=%5B%5BB%5D%5D; KP_UIDz-ssn=0wBoJjBpw91SVcixUwEgisdPblQEEdH4W23KaMPbIBfJ5gil4wT66Nciuh2zTPDR9fGMgc0yzGvdeQ4a43EdRDdYBHLXGupqDdoCTn0XGKyMM48n6yIiLfsHLVew7dBGSBfuY5aQFEJphzTM1ipSZRCSCP6zufzK5GqPHoLuf7Wn6jKTWfRVJmzObL17b0CCJvNEi1lNWs2; KP_UIDz=0wBoJjBpw91SVcixUwEgisdPblQEEdH4W23KaMPbIBfJ5gil4wT66Nciuh2zTPDR9fGMgc0yzGvdeQ4a43EdRDdYBHLXGupqDdoCTn0XGKyMM48n6yIiLfsHLVew7dBGSBfuY5aQFEJphzTM1ipSZRCSCP6zufzK5GqPHoLuf7Wn6jKTWfRVJmzObL17b0CCJvNEi1lNWs2; utag_main=v_id:0198bab39a9900128148666054590506f003a06700978$_sn:7$_se:8$_ss:0$_st:1756004155608$vapi_domain:property.com.au$ses_id:1756001232214%3Bexp-session$_pn:6%3Bexp-session$_prevpage:pca%3Aproperty_research%3Adetails%3Aproperty_details%3Bexp-1756005957090; s_nr30=1756002357093-Repeat; 
_sp_id.6d38=dea2f644-5418-4b60-a4f5-139bcf6c1868.1755478990.7.1756002357.1755930417.4294beab-7742-45c6-923a-fcb8fbd62155; _ga_FNS14GD7LR=GS2.1.s1756001233$o8$g1$t1756002357$j52$l0$h0; ab.storage.sessionId.b2062aeb-858c-41cc-b98c-ff2aaec84951=%7B%22g%22%3A%22b080677c-f76c-64ec-7bb6-7c78607523d9%22%2C%22e%22%3A1756004158529%2C%22c%22%3A1756001235224%2C%22l%22%3A1756002358529%7D; nol_fpid=vcvriivr6sqwmjqcremcsjaypx6dt1755478991|1755478991810|1756002360561|1756002360878; __hssc=80006152.6.1756001237383', } response = requests.get(start_url, headers=headers).text parse_response(response) def parse_response(response): # 解析响应 - 数据清洗和过滤 # print(response) A = etree.HTML(response) # 1. 提取标题和副标题 title_list = A.xpath('//h1[@class="Stack__StackContainer-sc-agfvon-0 fNHYjU"]/span//text()') # print(len(title_list)) if len(title_list) != 1: title = title_list[0] subtitle = title_list[-1] # 提取房产描述 description = A.xpath('//div[contains(@class, "PropertyDescriptionBrick")]/span//text()') if description: description = ' '.join([d.strip() for d in description if d.strip()]) # 提取物业时间表 # 提取所有的标题选项 primary_schools = A.xpath('//p[@class="ContentBrick__SubtitleWithDynamicContent-sc-1sfxg8l-4 ipEFVW"]//text()') sections = A.xpath('//section[contains(@class, "ContentBrick")]') # 分别存储小学和中学信息 and .//h2[contains(text(), "Nearby")]] primary_schools_info = {"description": "", "schools": []} secondary_schools_info = {"description": "", "schools": []} for section in sections: section_title = section.xpath('.//h2/text()') if not section_title: continue section_title = section_title[0].strip() # 学校信息 if "primary" in section_title.lower(): # 提取小学描述 primary_schools = section.xpath('.//p[@class="ContentBrick__SubtitleWithDynamicContent-sc-1sfxg8l-4 ipEFVW"]//text()') if primary_schools: primary_schools_info['description'] = " ".join([primary_school.strip() for primary_school in primary_schools if primary_school.strip()]) # 提取小学列表 - 表头和数据 table_head = 
section.xpath('.//div[@class="SchoolsAccordionTable__DesktopSchoolFactHeaderWrapper-sc-1w3ewmz-0 bedwwe"]//text()') table_head = list(dict.fromkeys(table_head)) school_items = section.xpath('.//div[contains(@class, "Accordion__Wrapper")]/div') for school in school_items: name = school.xpath('.//div[contains(@class, "SchoolAccordionItemstyled__TitleWrapper")]//text()') name = " ".join([n.strip() for n in name if n.strip()]) primary_schools_info['schools'].append(name) # 中学信息 elif "secondary" in section_title.lower(): # 提取中学描述 secondary_schools = section.xpath('.//p[@class="ContentBrick__SubtitleWithDynamicContent-sc-1sfxg8l-4 ipEFVW"]//text()') if secondary_schools: secondary_schools_info["description"] = ' '.join([secondary_school.strip() for secondary_school in secondary_schools if secondary_school.strip()]) # 提取中学列表 - 表头和数据 table_head = section.xpath('.//div[@class="SchoolsAccordionTable__DesktopSchoolFactHeaderWrapper-sc-1w3ewmz-0 bedwwe"]//text()') table_head = list(dict.fromkeys(table_head)) school_items = section.xpath('.//div[contains(@class, "Accordion__Wrapper")]/div') for school in school_items: school_name = school.xpath('.//div[contains(@class, "SchoolAccordionItemstyled__TitleWrapper")]//text()') school_name = " ".join([n.strip() for n in school_name if n.strip()]) secondary_schools_info["schools"].append(school_name) elif "kinder" in section_title.lower(): center_infos = [] centers = section.xpath('.//div[contains(@class, "Accordion__Wrapper")]/div') for center in centers: # 提取名称 name_elem = center.xpath('.//div[contains(@class, "ChildcareAccordionItemstyled__TitleWrapper")]//text()') # 提取其他字段 info_list = center.xpath('.//div[contains(@class, "ChildcareAccordionItemstyled__DesktopChildcareFactsSubtitle")]/div//text()') info_list = [info for info in info_list if info != ' '] rating = info_list[0] center_infos.append({ "name": name_elem, "rating": "", "reviews": "", "day_rate": "", "distance": "", }) all_sections_info = [] section_lists = 
A.xpath('//div[@class="PageContainer-sc-1bbzxlg-0 PropertyPageMainContent__PropertyPageMainBodyContentContainer-sc-44j84k-0 eTUAAl kwOMda"]/section') for section in section_lists[1: ]: key_value_lists = section.xpath('./div[@class="ContentBrick__BrickHeader-sc-1sfxg8l-1 iarlJC"]/div[@class="ContentBrick__TitlesWrapper-sc-1sfxg8l-2 kjhcFm"]//text()') other_text = section.xpath('./p[@class="Text__Typography-sc-1103tao-0 bTArky PropertyValuationBrick__Caption-sc-v7nzs2-1 ciEIJc"]//text()') # print(other_text) sepc_list = section.xpath('./div[@class="PropertyValuationBrick__PropertyValuationSubBrickContainer-sc-v7nzs2-0 kRtQPr"]/section') for sepc in sepc_list: small_text = sepc.xpath('./div//text()') # print(small_text) time_list = section.xpath('./div[@class="PropertyTimelineBeforeYouBuy__Container-sc-1orfm55-3 iGrzlx"]/div[@class="PropertyTimelineBeforeYouBuy__Card-sc-1orfm55-4 glseQh"]') for time in time_list: box_text = time.xpath('./div[@class="PropertyTimelineBeforeYouBuy__Card-sc-1orfm55-4 glseQh"]//text()') final_text = time.xpath('./iv[@class="PropertyTimelineBeforeYouBuy__DynamicDisclaimer-sc-1orfm55-8 bArUbh"]//text()') final_text = ''.join(final_text) # print(final_text) # 补充确实的key对应的value span_list = section.xpath('./div[@class="PropertyDescriptionBrickWithData__Description-sc-qqi42z-0 jRcBlF"]//text()') lack_value = '' if span_list: lack_value = ''.join(span.strip() for span in span_list if span.strip()) if 'About the property' in key_value_lists: key_value_lists.append(lack_value) all_sections_info.append(key_value_lists) # for info in all_sections_info: # print(info) def main(): spider() if __name__ == '__main__': main(),这是数据格式:RATING REVIEWS DAY RATE DISTANCE Jack & Jill Kindergarten Mosman Meeting NQS 4.0 (3) 4.0 out of 5 based on 3 user reviews $177 0.05km VACANCIES 7:30am to 6pm 2C Alexander Avenue, Mosman, NSW 2088 Research further Only About Children Mosman Meeting NQS 4.0 (5) 4.0 out of 5 based on 5 user reviews POA 0.33km Shadforth Cottage 
(Avenue Road) Meeting NQS 5.0 (9) 5.0 out of 5 based on 9 user reviews POA 0.69km Shadforth Cottage and Preschool Meeting NQS 5.0 (9) 5.0 out of 5 based on 9 user reviews POA 0.71km The Lookout Early Education Centre Out of scope 5.0 (23) 5.0 out of 5 based on 23 user reviews $208 0.16km
08-25
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值