使用urllib和BeautifulSoup实现爬虫抓取小说网站书名,解决BUG:'NoneType' object has no attribute 'find_all'

本文介绍如何使用Python的urllib库抓取小说网站的数据,并通过BeautifulSoup进行解析。在解析过程中,特别提到了一个常见错误:当对象为None时调用'find_all'方法会抛出AttributeError。解决方案是确保在调用该方法前对象不为None(例如先检查find()的返回值是否为None,再继续调用find_all)。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

首先找一个网站,例如

urlHTML='http://www.douban.com/tag/%E5%B0%8F%E8%AF%B4/?focus=book'


接下来利用urllib库抓取数据(需要先执行 import urllib.request),并把返回的响应对象保存到一个变量中

request_data=urllib.request.urlopen(urlHTML)


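用BeautifulSoup解析网页内容,并保存结果(需要先执行 from bs4 import BeautifulSoup)。第二个参数用来指定解析器,这里使用Python内置的 html.parser。注意:在Python中单引号和双引号完全等价,写成 'html.parser' 或 "html.parser" 都可以,引号的种类不会导致报错。

如果出现BUG      'NoneType' object has no attribute 'find_all',说明调用 find_all 的那个对象是 None(例如前一步的 find() 没有匹配到任何节点,或者解析结果没有被正确赋值给变量),应当检查该对象的来源。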

soup=BeautifulSoup(request_data,"html.parser")



用字典定义筛选规则,再使用BeautifulSoup库的find方法抓取数据
sty={
 
 
import curses import requests from bs4 import BeautifulSoup from urllib.parse import urljoin import os # 文件路径 FAVORITES_FILE = 'favorites.txt' HISTORY_FILE = 'history.txt' # 加载数据 def load_list(filename): try: with open(filename, 'r', encoding='utf-8') as f: return [line.strip() for line in f if line.strip()] except FileNotFoundError: return [] # 保存数据 def save_list(filename, items): with open(filename, 'w', encoding='utf-8') as f: for item in items: f.write(item + '\n') # 获取网页纯文本 def fetch_page(url): try: res = requests.get(url, timeout=5) res.raise_for_status() soup = BeautifulSoup(res.text, 'html.parser') for tag in soup(['script', 'style']): tag.decompose() return soup.get_text() except Exception as e: return f"加载失败: {e}" # 提取超链接表单(扩展版) def extract_links_and_forms(html, base_url): soup = BeautifulSoup(html, 'html.parser') # 提取超链接 links = [] for a in soup.find_all('a', href=True): href = a['href'] if not href.startswith('http'): href = urljoin(base_url, href) links.append((href, a.get_text(strip=True))) # 提取表单 forms = [] for form in soup.find_all('form'): action = form.get('action', '') if not action.startswith('http'): action = urljoin(base_url, action) method = form.get('method', 'get').lower() inputs = [] # 提取所有输入字段 for input_tag in form.find_all(['input', 'textarea', 'select']): name = input_tag.get('name') if not name: continue input_type = input_tag.get('type', 'text') value = input_tag.get('value', '') required = input_tag.has_attr('required') placeholder = input_tag.get('placeholder', '') if input_tag.name == 'textarea': input_type = 'textarea' value = input_tag.get_text() elif input_tag.name == 'select': input_type = 'select' options = [(opt.get('value') or opt.text, opt.text) for opt in input_tag.find_all('option')] inputs.append({ 'name': name, 'type': input_type, 'value': value, 'required': required, 'placeholder': placeholder, 'options': options }) continue # 处理 checkbox radio if input_type == 'checkbox' or input_type == 'radio': inputs.append({ 
'name': name, 'type': input_type, 'value': value, 'checked': input_tag.has_attr('checked'), 'required': required }) continue inputs.append({ 'name': name, 'type': input_type, 'value': value, 'required': required, 'placeholder': placeholder }) forms.append((action, method, inputs)) return links, forms # 输入框 def input_box(stdscr, prompt): curses.echo() stdscr.clear() stdscr.addstr(0, 0, prompt) stdscr.refresh() input_str = stdscr.getstr(1, 0).decode('utf-8') curses.noecho() return input_str # 提交表单(扩展版) def submit_form(stdscr, form): action, method, inputs = form data = {} for field in inputs: name = field['name'] input_type = field['type'] required = field.get('required', False) placeholder = field.get('placeholder', '') value = field.get('value', '') prompt = f"{name}" if placeholder: prompt += f"(提示:{placeholder})" if required: prompt += " [必填]" if input_type == 'select': options = field['options'] stdscr.clear() stdscr.addstr(0, 0, f"请选择 {name}:") for i, (val, text) in enumerate(options): stdscr.addstr(i + 1, 0, f"{i + 1}. 
{text}") stdscr.refresh() idx = int(stdscr.getstr(len(options) + 2, 0).decode('utf-8')) - 1 data[name] = options[idx][0] elif input_type == 'checkbox': checked = field.get('checked', False) res = input_box(stdscr, f"{name} [复选框] 是否选中?(y/n):") data[name] = 'on' if res.lower() == 'y' else '' elif input_type == 'radio': res = input_box(stdscr, f"{name} [单选] 是否选中?(y/n):") data[name] = value if res.lower() == 'y' else '' else: default = value if value else '' user_input = input_box(stdscr, f"{prompt}:") data[name] = user_input if user_input else default try: if method == 'post': res = requests.post(action, data=data) else: res = requests.get(action, params=data) return res.text except Exception as e: return f"表单提交失败:{e}" # 收藏夹菜单 def favorites_menu(stdscr, favorites, current_url): while True: options = ["新添", "删除", "退出"] action = show_list(stdscr, options, "收藏夹", options) if action == 0: # 新添 if current_url not in favorites: favorites.append(current_url) save_list(FAVORITES_FILE, favorites) elif action == 1: # 删除 if favorites: idx = show_list(stdscr, favorites, "选择要删除的收藏") if idx >= 0: favorites.pop(idx) save_list(FAVORITES_FILE, favorites) elif action == 2 or action == -1: break # 历史记录菜单 def history_menu(stdscr, history): while True: options = ["清空", "退出"] action = show_list(stdscr, options, "历史记录", options) if action == 0: # 清空 history.clear() save_list(HISTORY_FILE, history) elif action == 1 or action == -1: break elif action >= 0: return history[action] return None # 显示列表(收藏夹/历史记录) def show_list(stdscr, items, title, actions=None): selected = 0 while True: stdscr.clear() stdscr.addstr(0, 0, title) if actions: stdscr.addstr(0, len(title) + 2, f"| {' | '.join(actions)}") stdscr.addstr(1, 0, "-" * 50) for i, item in enumerate(items): if i == selected: stdscr.attron(curses.A_REVERSE) stdscr.addstr(i + 2, 0, f"{i + 1}. 
{item}") if i == selected: stdscr.attroff(curses.A_REVERSE) stdscr.addstr(len(items) + 3, 0, "方向键选择,Enter确认,q退出") stdscr.refresh() key = stdscr.getch() if key == curses.KEY_UP and selected > 0: selected -= 1 elif key == curses.KEY_DOWN and selected < len(items) - 1: selected += 1 elif key == ord('\n'): return selected elif key == ord('q'): return -1 def main(stdscr): curses.curs_set(0) # 隐藏光标 favorites = load_list(FAVORITES_FILE) history = load_list(HISTORY_FILE) # 初始页面加载 current_url = input_box(stdscr, "请输入网址:") page_text = fetch_page(current_url) if current_url not in history: history.append(current_url) if len(history) > 20: history.pop(0) save_list(HISTORY_FILE, history) # 初始数据 menu_options = ["收藏夹", "新的网页", "历史记录"] selected_menu = 0 links, forms = extract_links_and_forms(page_text, current_url) selected_link = 0 selected_form = 0 mode = "menu" # 当前模式:menu, link, form while True: stdscr.clear() h, w = stdscr.getmaxyx() # 显示顶部菜单 for i, opt in enumerate(menu_options): x = 2 + i * 15 if mode == "menu" and i == selected_menu: stdscr.attron(curses.A_REVERSE) stdscr.addstr(0, x, opt) if mode == "menu" and i == selected_menu: stdscr.attroff(curses.A_REVERSE) stdscr.addstr(1, 0, "-" * w) # 显示网页内容 lines = page_text.split('\n') for i, line in enumerate(lines[:h - 10]): stdscr.addstr(i + 2, 0, line[:w - 1]) # 显示超链接 stdscr.addstr(h - 8, 0, "超链接:") for i, (url, text) in enumerate(links): label = f"{i + 1}. {text[:30]}..." if mode == "link" and i == selected_link: stdscr.attron(curses.A_REVERSE) stdscr.addstr(h - 7 + i, 0, label[:w - 1]) if mode == "link" and i == selected_link: stdscr.attroff(curses.A_REVERSE) # 显示表单 stdscr.addstr(h - 7 + len(links) + 1, 0, "表单:") for i, (action, method, inputs) in enumerate(forms): label = f"表单 {i + 1}: {method.upper()} {action[:30]}..." 
if mode == "form" and i == selected_form: stdscr.attron(curses.A_REVERSE) stdscr.addstr(h - 6 + len(links) + i + 1, 0, label[:w - 1]) if mode == "form" and i == selected_form: stdscr.attroff(curses.A_REVERSE) # 底部提示 stdscr.addstr(h - 1, 0, "方向键选择,Enter确认,q返回,Tab切换区域") stdscr.refresh() # 用户输入处理 key = stdscr.getch() if key == ord('q'): break elif key == ord('\t'): # 切换模式:菜单 -> 链接 -> 表单 -> 菜单 if mode == "menu": mode = "link" elif mode == "link": mode = "form" else: mode = "menu" elif key == curses.KEY_UP: if mode == "menu" and selected_menu > 0: selected_menu -= 1 elif mode == "link" and selected_link > 0: selected_link -= 1 elif mode == "form" and selected_form > 0: selected_form -= 1 elif key == curses.KEY_DOWN: if mode == "menu" and selected_menu < len(menu_options) - 1: selected_menu += 1 elif mode == "link" and selected_link < len(links) - 1: selected_link += 1 elif mode == "form" and selected_form < len(forms) - 1: selected_form += 1 elif key == ord('\n'): if mode == "menu": if selected_menu == 0: favorites_menu(stdscr, favorites, current_url) elif selected_menu == 1: current_url = input_box(stdscr, "请输入网址:") page_text = fetch_page(current_url) if current_url not in history: history.append(current_url) if len(history) > 20: history.pop(0) save_list(HISTORY_FILE, history) links, forms = extract_links_and_forms(page_text, current_url) elif selected_menu == 2: selected_url = history_menu(stdscr, history) if selected_url: current_url = selected_url page_text = fetch_page(current_url) links, forms = extract_links_and_forms(page_text, current_url) elif mode == "link": if links: current_url = links[selected_link][0] page_text = fetch_page(current_url) links, forms = extract_links_and_forms(page_text, current_url) elif mode == "form": if forms: result = submit_form(stdscr, forms[selected_form]) page_text = result links, forms = extract_links_and_forms(page_text, current_url) return # 启动程序 if __name__ == "__main__": curses.wrapper(main) 这是我的纯文本浏览器的源代码。它报错了:Python 3.11.9 
(tags/v3.11.9:de54cf5, Apr 2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. = RESTART: C:\Users\number one\Desktop\纯文本浏览器.py Traceback (most recent call last): File "C:\Users\number one\Desktop\纯文本浏览器.py", line 363, in <module> curses.wrapper(main) File "C:\Users\number one\AppData\Local\Programs\Python\Python311\Lib\curses\__init__.py", line 73, in wrapper stdscr = initscr() File "C:\Users\number one\AppData\Local\Programs\Python\Python311\Lib\curses\__init__.py", line 30, in initscr fd=_sys.__stdout__.fileno()) AttributeError: 'NoneType' object has no attribute 'fileno'
最新发布
07-12
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值