import re
import time
from threading import Thread

import requests

# Browser-like headers copied from a real Chrome session so dangdang.com
# serves the normal desktop page instead of an anti-bot response.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Referer": "https://category.dangdang.com/cp01.05.12.00.00.00.html",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Google Chrome\";v=\"128\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
}

# Session cookies captured from the same browser visit.
cookies = {
    "ddscreen": "2",
    "__permanent_id": "20240921142852592769874379858564552",
    "dest_area": "country_id%3D9000%26province_id%3D111%26city_id%3D0%26district_id%3D0%26town_id%3D0",
    "__visit_id": "20240921153554167392616984617455899",
    "__out_refer": "",
    "__rpm": "s_605253.4516808.0.1726909350202%7Cs_605253.4516873.6.1726909477624",
    "search_passback": "a114a38296e6f685248cee660000000042a7cb001d8cee66",
    "__trace_id": "20240921170439237375191289874994710",
}

# Category landing page listing all book sub-categories.
url = "https://category.dangdang.com/cp01.05.00.00.00.00.html"

# Fix: send the browser headers on this first request too, add a timeout so a
# stalled connection cannot hang the script, and decode with errors="replace"
# so a stray non-GBK byte cannot crash the parse.
response = requests.get(url=url, headers=headers, timeout=10)
content = response.content.decode('gbk', errors="replace")
# print(chardet.detect(response.content)['encoding'])
# print(content)

# (url_path, title) pairs for every sub-category link on the landing page.
book_types = re.findall(' <span rel=""><a href="/(.*?).html" title="(.*?)"><em>', content)


def task(n):
    """Scrape and print book titles for sub-category *n* of ``book_types``.

    Walks 0.5-yuan price bands from [799.5, 800] down to [0.5, 1.0]; within
    each band it pages through up to 100 result pages, printing every batch of
    titles found, and stops paging a band as soon as a page yields no titles.

    :param n: index into the module-level ``book_types`` list.
    """
    # Split the (path, title) pairs once; only index n is used below.
    book_urls_li = [pair[0] for pair in book_types]
    book_type_li = [pair[1] for pair in book_types]

    print(book_type_li[n])
    # NOTE(fix): renamed from min/max, which shadowed builtins.  The original
    # also wrapped this scan in a redundant `for book_url in book_urls_li:`
    # loop that did real work only on its first pass (lo < 0.5 afterwards)
    # while re-printing the title each time — removed.
    lo = 799.5
    hi = 800
    while lo >= 0.5:
        for page in range(1, 101):
            # Page 1 has no `pgN-` prefix in dangdang's URL scheme.
            prefix = f'pg{page}-' if page >= 2 else ''
            response_book = requests.get(
                url='https://category.dangdang.com/' + prefix
                    + book_urls_li[n] + f'-lp{lo}-hp{hi}.html',
                headers=headers,
                cookies=cookies,
                timeout=10,
            )
            content_book = response_book.content.decode('gbk', errors="replace")
            # print(content_book)
            datas = re.findall(' <a title=" (.*?)" ddclick=', content_book)
            if not datas:
                break  # empty page: no more results in this price band
            print(datas)
        # Move down to the next 0.5-yuan price band.
        lo -= 0.5
        hi -= 0.5


if __name__ == '__main__':
    start_time = time.time()
    t_list = []
    # One thread per sub-category: the work is I/O-bound, so the threads
    # overlap their network waits (completion order is unordered).
    for i in range(15):
        t = Thread(target=task, args=(i,))
        t.start()
        t_list.append(t)
    for t in t_list:
        t.join()
    print("cost time: ", time.time() - start_time)
# 【Python——爬取当当网书籍相关】 (Python — scraping Dangdang book listings)
# 于 2024-09-24 09:03:08 首次发布 (first published 2024-09-24 09:03:08)