import re
import time
from threading import Thread

import requests

# Browser-like headers copied from a real Chrome session so dangdang.com
# serves the normal desktop page instead of an anti-bot response.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Referer": "https://category.dangdang.com/cp01.05.12.00.00.00.html",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
}

# Session cookies captured from the same browser visit.
cookies = {
    "ddscreen": "2",
    "__permanent_id": "20240921142852592769874379858564552",
    "dest_area": "country_id%3D9000%26province_id%3D111%26city_id%3D0%26district_id%3D0%26town_id%3D0",
    "__visit_id": "20240921153554167392616984617455899",
    "__out_refer": "",
    "__rpm": "s_605253.4516808.0.1726909350202%7Cs_605253.4516873.6.1726909477624",
    "search_passback": "a114a38296e6f685248cee660000000042a7cb001d8cee66",
    "__trace_id": "20240921170439237375191289874994710",
}

# Category landing page listing all book sub-categories.
url = "https://category.dangdang.com/cp01.05.00.00.00.00.html"

# Fix: send the browser headers on this first request too, add a timeout so a
# stalled connection cannot hang the script, and decode with errors="replace"
# so a stray non-GBK byte cannot crash the parse.
response = requests.get(url=url, headers=headers, timeout=10)
content = response.content.decode('gbk', errors="replace")
# print(chardet.detect(response.content)['encoding'])
# print(content)

# (url_path, title) pairs for every sub-category link on the landing page.
book_types = re.findall(' <span rel=""><a href="/(.*?).html" title="(.*?)"><em>', content)


def task(n):
    """Scrape and print book titles for sub-category *n* of ``book_types``.

    Walks 0.5-yuan price bands from [799.5, 800] down to [0.5, 1.0]; within
    each band it pages through up to 100 result pages, printing every batch of
    titles found, and stops paging a band as soon as a page yields no titles.

    :param n: index into the module-level ``book_types`` list.
    """
    # Split the (path, title) pairs once; only index n is used below.
    book_urls_li = [pair[0] for pair in book_types]
    book_type_li = [pair[1] for pair in book_types]

    print(book_type_li[n])
    # NOTE(fix): renamed from min/max, which shadowed builtins.  The original
    # also wrapped this scan in a redundant `for book_url in book_urls_li:`
    # loop that did real work only on its first pass (lo < 0.5 afterwards)
    # while re-printing the title each time — removed.
    lo = 799.5
    hi = 800
    while lo >= 0.5:
        for page in range(1, 101):
            # Page 1 has no `pgN-` prefix in dangdang's URL scheme.
            prefix = f'pg{page}-' if page >= 2 else ''
            response_book = requests.get(
                url='https://category.dangdang.com/' + prefix
                    + book_urls_li[n] + f'-lp{lo}-hp{hi}.html',
                headers=headers,
                cookies=cookies,
                timeout=10,
            )
            content_book = response_book.content.decode('gbk', errors="replace")
            # print(content_book)
            datas = re.findall(' <a title=" (.*?)" ddclick=', content_book)
            if not datas:
                break  # empty page: no more results in this price band
            print(datas)
        # Move down to the next 0.5-yuan price band.
        lo -= 0.5
        hi -= 0.5


if __name__ == '__main__':
    start_time = time.time()
    t_list = []
    # One thread per sub-category: the work is I/O-bound, so the threads
    # overlap their network waits (completion order is unordered).
    for i in range(15):
        t = Thread(target=task, args=(i,))
        t.start()
        t_list.append(t)
    for t in t_list:
        t.join()
    print("cost time: ", time.time() - start_time)
# 【Python——爬取当当网书籍相关】 (Python — scraping Dangdang book listings)
# 于 2024-09-24 09:03:08 首次发布 (first published 2024-09-24 09:03:08)