def parsePage(html_content, authorName_list, likeNr_list, URL_list, userURL_list, num):
"""
解析网页内容并更新数据列表。
Args:
html_content (str): 当前页面的HTML内容
authorName_list (list): 存储作者名字的列表
likeNr_list (list): 存储获赞数量的列表
URL_list (list): 存储笔记URL的列表
userURL_list (list): 存储用户URL的列表
qbar (tqdm): 进度条对象
num (int): 需要爬取的笔记数量
Returns:
None: 数据存储在传入的列表中
"""
response = Selector(text=html_content)
divs = response.xpath('//div[contains(@class, "feeds-container")]/section/div')# 选中网页中包含笔记信息的部分
# 遍历divs获取每一篇笔记的信息
for div in divs:
if len(URL_list) >= num:
break
if div.xpath('.//span[contains(text(), "大家都在搜")]'):
continue
# 选择并提取网页数据
try:
author_name = div.xpath('.//a[contains(@class, "author")]/span[contains(@class, "name")]/text()').get()# 作者名字
like_nr = div.xpath('.//span[contains(@class, "count")]/text()').get()# 获赞数量
url = div.xpath('.//a[contains(@class, "cover")]/@href').get()# 笔记URL
user_url = div.xpath('.//a[contains(@class, "author")]/@href').get()# 用户URL
authorName_list.append(author_name)
likeNr_list.append(like_nr)
URL_list.append(url)
userURL_list.append(user_url)
time.sleep(0.35)
except:
pass
return True
authorName_list, likeNr_list, URL_list, userURL_list = [], [], [], []
qbar = tqdm(total=num, desc="已获取的笔记数量...")
# 检查是否已经爬取足够数量的笔记,或是否已经达到页面底部
while len(URL_list) < num:
if '- THE END -' in browser.page_source:
print(f"当前与{key_word}有关的笔记数量少于 {num}")
print('检查时间:',time.ctime())
break
parsePage(browser.page_source, authorName_list, likeNr_list, URL_list, userURL_list, num)
qbar.update(1)
if len(URL_list) < num:
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')# 模拟鼠标滚动
time.sleep(random.uniform(3, 5))
if len(URL_list) > num:
URL_list = URL_list[:num]
authorName_list = authorName_list[:num]
likeNr_list = likeNr_list[:num]
userURL_list = userURL_list[:num]
qbar.close()
这段代码发生了以下错误,请帮我改正
LookupError Traceback (most recent call last)
Cell In[11], line 57
54 print('检查时间:',time.ctime())
55 break
---> 57 parsePage(browser.page_source, authorName_list, likeNr_list, URL_list, userURL_list, num)
58 qbar.update(1)
60 if len(URL_list) < num:
Cell In[11], line 17, in parsePage(html_content, authorName_list, likeNr_list, URL_list, userURL_list, num)
1 def parsePage(html_content, authorName_list, likeNr_list, URL_list, userURL_list, num):
2 """
3 解析网页内容并更新数据列表。
4
(...)
15 None: 数据存储在传入的列表中
16 """
---> 17 response = Selector(text=html_content)
18 divs = response.xpath('//div[contains(@class, "feeds-container")]/section/div')# 选中网页中包含笔记信息的部分
20 # 遍历divs获取每一篇笔记的信息
File d:\anaconda3\Lib\site-packages\scrapy\selector\unified.py:97, in Selector.__init__(self, response, text, type, root, **kwargs)
94 if root is not _NOT_SET:
95 kwargs["root"] = root
---> 97 super().__init__(text=text, type=st, **kwargs)
File d:\anaconda3\Lib\site-packages\parsel\selector.py:496, in Selector.__init__(self, text, type, body, encoding, namespaces, root, base_url, _expr, huge_tree)
493 msg = f"text argument should be of type str, got {text.__class__}"
494 raise TypeError(msg)
--> 496 root, type = _get_root_and_type_from_text(
497 text,
498 input_type=type,
499 base_url=base_url,
500 huge_tree=huge_tree,
501 )
502 self.root = root
503 self.type = type
File d:\anaconda3\Lib\site-packages\parsel\selector.py:377, in _get_root_and_type_from_text(text, input_type, **lxml_kwargs)
375 assert input_type in ("html", "xml", None) # nosec
376 type = _xml_or_html(input_type)
--> 377 root = _get_root_from_text(text, type=type, **lxml_kwargs)
378 return root, type
File d:\anaconda3\Lib\site-packages\parsel\selector.py:329, in _get_root_from_text(text, type, **lxml_kwargs)
326 def _get_root_from_text(
327 text: str, *, type: str, **lxml_kwargs: Any
328 ) -> etree._Element:
--> 329 return create_root_node(text, _ctgroup[type]["_parser"], **lxml_kwargs)
File d:\anaconda3\Lib\site-packages\parsel\selector.py:110, in create_root_node(text, parser_cls, base_url, huge_tree, body, encoding)
107 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
109 if huge_tree and LXML_SUPPORTS_HUGE_TREE:
--> 110 parser = parser_cls(recover=True, encoding=encoding, huge_tree=True)
111 root = etree.fromstring(body, parser=parser, base_url=base_url)
112 else:
File d:\anaconda3\Lib\site-packages\lxml\html\__init__.py:1887, in HTMLParser.__init__(self, **kwargs)
1886 def __init__(self, **kwargs):
-> 1887 super().__init__(**kwargs)
1888 self.set_element_class_lookup(HtmlElementClassLookup())
File src\\lxml\\parser.pxi:1806, in lxml.etree.HTMLParser.__init__()
File src\\lxml\\parser.pxi:858, in lxml.etree._BaseParser.__init__()
LookupError: unknown encoding: 'b'utf8''
最新发布