Recommended C Book List

This post recommends a range of C-related books: introductions suited to beginners such as C Programming: A Modern Approach, reference works such as C: A Reference Manual, and advanced titles such as C Traps and Pitfalls, covering everything from first steps to deeper exploration.

GENERAL INTRODUCTION/TUTORIAL:

(1) For real beginners looking for a solid introduction:

C Programming: A Modern Approach.
K.N.King.
W.W.Norton & Company, 1996.
ISBN 0-393-96945-2


(2) For somewhat more experienced users looking for a solid introduction:

The C Programming Language, 2nd Ed.
Kernighan & Ritchie.
Prentice Hall, 1988.
ISBN 0-13-110362-8


(3) Other recommended introductory books:


C: How to Program, 2nd Ed.
Deitel, H.M. & Deitel, P.J.
Prentice Hall, 1994.
ISBN 0-13-226119-7


REFERENCE BOOKS:


C: A Reference Manual, 4th Ed.
Harbison & Steele.
Prentice Hall, 1995.
ISBN 0-13-326224-3


The Standard C Library.
P.J.Plauger.
Prentice Hall, 1992.
ISBN 0-13-131509-9


C Programming FAQs.
Steve Summit.
Addison-Wesley, 1996.
ISBN 0-201-84519-9


ADVANCED TOPICS / FURTHER EXPLORATION:

C Traps and Pitfalls.
Andrew Koenig.
Addison-Wesley, 1989.
ISBN 0-201-17928-8


Expert C Programming: Deep C Secrets.
Peter van der Linden.
Prentice Hall, 1994.
ISBN 0-13-177429-8


Practical C Programming.
Steve Oualline.
O'Reilly & Associates, 1993.
ISBN 1-56592-035-X


Problem Solving And Program Design In C, 2nd Ed.
Hanly & Koffman.
Addison-Wesley, 1996.
ISBN 0-201-59063-8


Algorithms in C, 3rd Ed.
Robert Sedgewick.
Addison-Wesley, 1998.
ISBN 0-201-31452-5

C Unleashed.
Heathfield, Kirby, et al.
Sams Publishing, 2000.
ISBN 0-672-31896-2
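
To give a flavour of the kind of subtle behaviour the advanced titles above dissect (C Traps and Pitfalls in particular), here is a minimal illustrative sketch of two classic C pitfalls. It is a toy example written for this list, not code taken from any of the books.

/* pitfalls.c -- minimal sketch of two classic C pitfalls of the sort
   discussed in "C Traps and Pitfalls"; written for this list only. */
#include <stdio.h>

int main(void)
{
    int x = 5;

    /* Pitfall 1: assignment instead of comparison.
       "if (x = 0)" compiles, silently sets x to 0, and the branch
       never runs; the intended test is "if (x == 0)". */
    if (x == 0)
        printf("x is zero\n");

    /* Pitfall 2: operator precedence.
       "==" binds tighter than "&", so "flags & mask == 0" parses as
       "flags & (mask == 0)"; the parentheses below state the intent. */
    unsigned flags = 0x2u, mask = 0x2u;
    if ((flags & mask) == 0)
        printf("bit is clear\n");
    else
        printf("bit is set\n");

    return 0;
}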
