How to scrape NBA news from a portal site with Selenium
Most pages nowadays load their content dynamically, so a plain HTTP scraper often comes back empty-handed. On the news page of this portal, for example, trying to extract the publish time with ordinary request-based scraping will very likely return an empty string or an empty list. Selenium, a third-party library that drives a real browser the way a human would, gets around this: its page_source attribute gives you the source of the page exactly as you see it rendered. Pass that string to etree.HTML() from lxml to turn it into an HTML element tree, and parsing it with XPath becomes straightforward.
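A minimal sketch of that pipeline, assuming chromedriver is reachable on PATH (the XPath below is the link selector reused in the full code later):

```python
from selenium import webdriver
from lxml import etree
import time

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get('https://sports.qq.com/nba/')
time.sleep(2)                             # let the dynamic content render
html = etree.HTML(driver.page_source)     # rendered source -> lxml element tree
# The same query against a plain HTTP fetch would come back empty:
print(html.xpath('//div[@class="col-left"]/ul//li/a/@href'))
driver.quit()
```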
The plan: start from the main NBA page and collect the links of the news section from it; then, for each link, have Selenium open a new window (mind the handle switching), grab the title, body text, author, publish time and image links, combine them and save them in the desired format; then close that window, jump back to the main page, and repeat. Since the window switching is the step that trips people up, its skeleton is sketched below first, followed by the full code.
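The tab juggling, stripped of all parsing (a sketch under the same assumptions as above):

```python
from selenium import webdriver
from lxml import etree
import time

driver = webdriver.Chrome()
driver.get('https://sports.qq.com/nba/')
time.sleep(2)
html = etree.HTML(driver.page_source)
url = html.xpath('//div[@class="col-left"]/ul//li/a/@href')[0]  # first article link

driver.execute_script('window.open("%s")' % url)   # open the article in a new tab
driver.switch_to.window(driver.window_handles[1])  # focus must be moved by hand
print(driver.title)                                # we are now reading the article tab
driver.close()                                     # close only the article tab
driver.switch_to.window(driver.window_handles[0])  # jump back to the list page
driver.quit()
```

With that pattern in place, the full spider: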
```python
from selenium import webdriver
from lxml import etree
from urllib import request
import time
import os


class NBA(object):
    def __init__(self):
        # Selenium 3 style; see the Selenium 4 note after the code.
        path = r'F:/study/soton/chromedriver.exe'
        self.url = 'https://sports.qq.com/nba/'
        self.driver = webdriver.Chrome(executable_path=path)

    def parse_list(self):
        """Open the NBA front page and collect the article links."""
        self.driver.get(self.url)
        time.sleep(2)  # crude wait for the dynamically loaded list
        html_list = etree.HTML(self.driver.page_source)
        alist = []
        for li in html_list.xpath('//div[@class="col-left"]/ul//li'):
            hrefs = li.xpath('./a/@href')
            if hrefs:  # skip <li> items that carry no link
                alist.append(hrefs[0])
        return alist

    def parse_firstpage(self):
        """The headline article uses its own template, so parse it separately."""
        url = self.parse_list()[0]
        # Open the article in a new tab; the URL must be quoted inside the JS call.
        self.driver.execute_script('window.open("%s")' % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        time.sleep(3)
        html_fp = etree.HTML(self.driver.page_source)
        title = html_fp.xpath('//div[@class="hd"]/h1/text()')[0]
        pub_time = html_fp.xpath('//span[@class="a_time"]/text()')[0]
        author = html_fp.xpath('//div[@class="qq_editor"]/text()')[0]
        content = []
        for p in html_fp.xpath('//p[@class="text"]'):
            text_p = p.xpath('./text()')
            content.append(text_p[0] if text_p else '\n')  # keep empty paragraphs as newlines
        content = ''.join(content)
        img_urls = []
        for p in html_fp.xpath('//p[@align="center"]'):
            src = p.xpath('./img/@src')
            if src:
                img_urls.append('https:' + src[0])  # the page uses protocol-relative URLs
        self.store_text(title, pub_time, author, content)
        self.store_img(img_urls, title)
        # Close the article tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])

    def parse_detail(self):
        """Loop over the remaining links, which all share one article template."""
        for url in self.parse_list()[1:]:
            self.driver.execute_script('window.open("%s")' % url)
            self.driver.switch_to.window(self.driver.window_handles[1])
            time.sleep(3)
            html_detail = etree.HTML(self.driver.page_source)
            title = html_detail.xpath('//div[@class="LEFT"]/h1/text()')[0]
            year = html_detail.xpath('//div[@class="left-stick-wp"]/div[contains(@class,"year")]//text()')[0]
            date = ''.join(html_detail.xpath('//div[@class="md"]//text()'))
            pub_time = year + '/' + date
            try:
                author = html_detail.xpath('//a[@class="author"]/div/text()')[0]
            except IndexError:
                author = ''
            contents = html_detail.xpath('//div[@class="content-article"]/p[@class="one-p"]')
            # Join all text nodes so an empty paragraph cannot raise IndexError.
            content = ''.join(''.join(p.xpath('.//text()')) for p in contents)
            img_urls = []
            for p in contents:
                src = p.xpath('./img[@class="content-picture"]/@src')
                if src:
                    img_urls.append('https:' + src[0])
            self.store_text(title, pub_time, author, content)
            self.store_img(img_urls, title)
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])

    def store_text(self, t, p, a, c):
        """Save title, publish time, author and body into one txt file per article."""
        # Note: a title containing characters like \/:*?"<>| would need sanitising on Windows.
        text_content = t + '\n' + p + '\t' + a + '\n' + c
        with open('F:/tmp/nba/nbatext/%s.txt' % t, 'w', encoding='utf-8') as fp:
            fp.write(text_content)
        print('article saved')

    def store_img(self, urls, t):
        """Download every image of an article into its own folder."""
        # makedirs creates missing parents and tolerates an existing folder.
        os.makedirs('F:/tmp/nba/nbaimages/%s' % t, exist_ok=True)
        for i, img_url in enumerate(urls):
            try:  # e.g. F:/tmp/nba/nbaimages/title/title0.png
                request.urlretrieve(img_url, 'F:/tmp/nba/nbaimages/{}/{}{}.png'.format(t, t, i))
                time.sleep(1)  # be polite to the image server
                print('image downloaded')
            except Exception:
                print('download failed')


if __name__ == '__main__':
    a = NBA()
    a.parse_firstpage()
    a.parse_detail()
    a.driver.quit()
```
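One caveat if you run this today: the executable_path keyword was removed in Selenium 4, which takes the driver path through a Service object instead (recent versions can usually locate the driver on their own). A sketch of the Selenium 4 equivalent of the constructor, assuming the same driver path:

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: the driver path goes through a Service object.
driver = webdriver.Chrome(service=Service(r'F:/study/soton/chromedriver.exe'))
```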
This post has shown how to use Selenium to scrape NBA news from a portal site, including the news titles, body text, authors, publish times and image links, using a driven browser to get around content that is loaded dynamically.