"""Selenium/BeautifulSoup scraping demo.

chromedriver download: https://chromedriver.com/download#stable
Note: the chromedriver version must match the installed Chrome version;
place chromedriver.exe in the Chrome installation directory.

1. Scrape the plain text of a web page.
2. Search Bing with a keyword and extract the result titles and links.
"""
import time
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
def get_text_page():
    """Open an AI overview page in Chrome and print its visible text.

    Launches Chrome through a local chromedriver, waits for the page to
    render, parses the HTML with BeautifulSoup, prints the extracted plain
    text, and always shuts the browser down, even on error.
    """
    page_url = "https://www.oracle.com/artificial-intelligence/what-is-ai/"
    driver = webdriver.Chrome(
        service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe")
    )
    try:
        driver.get(page_url)
        # Give the page time to render before reading the DOM.
        time.sleep(3)
        # Encode to bytes so BeautifulSoup treats the markup as UTF-8.
        content = driver.page_source.encode('utf-8')
        dom_bs = BeautifulSoup(content, 'lxml')
        # .text yields only the human-visible text of the document.
        print(dom_bs.text)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the window and leave the chromedriver process running.
        driver.quit()
get_text_page()
# Open Bing, search for a keyword, and print each result's link and title.
def driver_open(key_word):
    """Search cn.bing.com for *key_word*; print each result's URL and title.

    Parameters
    ----------
    key_word : str
        Search query; URL-encoded before being placed in the query string.

    Returns
    -------
    str
        Always the empty string (kept for backward compatibility).
    """
    # quote_plus handles spaces and special characters in the query.
    url = "https://cn.bing.com/search?q=" + quote_plus(key_word) + "&ensearch=1&FORM=BESBTB"
    driver = webdriver.Chrome(
        service=Service("C:/Program Files/Google/Chrome/Application/chromedriver.exe")
    )
    try:
        driver.get(url)
        # Wait for the browser to load the result page.
        time.sleep(2)
        # Encode to bytes so BeautifulSoup treats the markup as UTF-8.
        content = driver.page_source.encode('utf-8')
    finally:
        # Always release the browser, even if the request fails; quit() ends
        # the whole session rather than just the window.
        driver.quit()
    dom_bs = BeautifulSoup(content, 'lxml')
    # Each organic search result is an <li class="b_algo"> element.
    for li_tag in dom_bs.find_all(name="li", attrs={"class": "b_algo"}):
        # Navigate the already-parsed tree directly instead of re-parsing
        # each fragment with a fresh BeautifulSoup instance.
        h2_tag = li_tag.find("h2")
        if h2_tag is None:
            continue
        a_tag = h2_tag.find("a")
        if a_tag is None:
            continue
        # Print the link address and the title text of the result.
        print(a_tag.attrs["href"])
        print(a_tag.text)
    return ""
driver_open("text")