import requests
from bs4 import BeautifulSoup
def del_span(l):
    """Strip every element equal to the newline string from *l*, in place.

    Intended for lists such as a BeautifulSoup tag's ``.contents``, where
    whitespace between tags shows up as standalone ``"\\n"`` items.

    Args:
        l: A mutable list whose ``"\\n"`` entries should be dropped.

    Returns:
        The same list object that was passed in, with the newline entries
        removed.
    """
    # Only rewrite when there is something to remove, so inputs without a
    # newline entry are handed back untouched.
    if '\n' in l:
        # One filtering pass with slice assignment keeps the mutation
        # in place while avoiding a rescan + remove per newline.
        l[:] = [item for item in l if item != '\n']
    return l
if __name__ == '__main__':
    # Download the Douban Books home page and parse it with the lxml parser.
    url = 'https://book.douban.com/'
    # A timeout is set because requests otherwise waits indefinitely for a
    # server that never answers.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')

    # Everything below is a set of commented-out BeautifulSoup usage notes.

    # print(soup.prettify())
    # print(soup.title.string.strip())
    # print(soup.title)
    # print(type(soup.title))
    # # print(soup.head)
    # print(soup.a)
    # print(soup.a['href'])
    # print(soup.p['class'])
    #
    # # Nested tag access; .contents returns a list object
    # l = soup.div.div.div.contents
    # # Nested tag access; .children returns an iterator object
    # l2 = soup.div.div.div.children
    # l1 = del_span(l)
    # for i, item in enumerate(l1, start=1):
    #     print(i, ':', item)
    #
    # # Nested tag access; .descendants returns an iterator over all
    # # descendants, and each nested descendant is also yielded as its own item
    # l3 = soup.div.div.div.descendants
    # for i, item in enumerate(l3, start=1):
    #     # if item == "\n":
    #     #     continue
    #     print(i, ':', item)

    # Get the parent node; returns a bs4.element.Tag object
    # par = soup.a.parent
    # print(type(par))
    #
    # # Get all ancestor nodes; returns a generator object
    # pars = soup.a.parents
    # print(list(enumerate(pars)))
    #
    # # Get sibling nodes
    # sib = soup.div.next_siblings  # all following siblings
    # sibs = soup.div.previous_siblings  # all preceding siblings
    # print(sib)
    # print(sibs)

    # find_all selector: returns a collection of matching elements
    # uls = soup.div.div.find_all('ul')
    # print(type(uls))
    # print(uls[0])

    # Find elements by attribute
    # atts = soup.div.div.find_all(attrs={'class': 'more-items'})
    # ids = soup.div.div.find_all(id='123456')
    # classes = soup.div.div.find_all(class_='more-items')
    # print(atts)

    # The text argument ('登录' is the page's "log in" link text)
    # texts = soup.div.div.find_all(text='登录')
    # print(texts)

    # find: returns a single element
    # one = soup.div.div.find('ul')
    # print(one)

    # Return all ancestor nodes / the parent node
    # find_parents()
    # find_parent()
    # Return the first following sibling / all following siblings
    # find_next_sibling()
    # find_next_siblings()
    # Return the first preceding sibling / all preceding siblings
    # find_previous_sibling()
    # find_previous_siblings()
    # Return the first / all matching nodes after this node
    # find_next()
    # find_all_next()
    # Return the first / all matching nodes before this node
    # find_previous()
    # find_all_previous()

    # CSS selectors
    # soup.select('ul li')
    # soup.select('.class1 .class2')
    # soup.select('#id1 .class1')

    # Get the text inside tags
    # p = soup.select('p')
    # for p1 in p:
    #     print(p1.get_text().replace(' ', '').strip())
from bs4 import BeautifulSoup
def del_span(l):
    """Strip every element equal to the newline string from *l*, in place.

    Intended for lists such as a BeautifulSoup tag's ``.contents``, where
    whitespace between tags shows up as standalone ``"\\n"`` items.

    Note:
        ``del_span`` is also defined earlier in this file; this later
        definition is the one bound to the name once the module has
        finished loading.

    Args:
        l: A mutable list whose ``"\\n"`` entries should be dropped.

    Returns:
        The same list object that was passed in, with the newline entries
        removed.
    """
    # Only rewrite when there is something to remove, so inputs without a
    # newline entry are handed back untouched.
    if '\n' in l:
        # One filtering pass with slice assignment keeps the mutation
        # in place while avoiding a rescan + remove per newline.
        l[:] = [item for item in l if item != '\n']
    return l
if __name__ == '__main__':
    # NOTE(review): this entry-point block repeats the one earlier in the
    # file, so the page is fetched and parsed a second time when run as a
    # script.
    # Download the Douban Books home page and parse it with the lxml parser.
    url = 'https://book.douban.com/'
    # A timeout is set because requests otherwise waits indefinitely for a
    # server that never answers.
    html = requests.get(url, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')

    # Everything below is a set of commented-out BeautifulSoup usage notes.

    # print(soup.prettify())
    # print(soup.title.string.strip())
    # print(soup.title)
    # print(type(soup.title))
    # # print(soup.head)
    # print(soup.a)
    # print(soup.a['href'])
    # print(soup.p['class'])
    #
    # # Nested tag access; .contents returns a list object
    # l = soup.div.div.div.contents
    # # Nested tag access; .children returns an iterator object
    # l2 = soup.div.div.div.children
    # l1 = del_span(l)
    # for i, item in enumerate(l1, start=1):
    #     print(i, ':', item)
    #
    # # Nested tag access; .descendants returns an iterator over all
    # # descendants, and each nested descendant is also yielded as its own item
    # l3 = soup.div.div.div.descendants
    # for i, item in enumerate(l3, start=1):
    #     # if item == "\n":
    #     #     continue
    #     print(i, ':', item)

    # Get the parent node; returns a bs4.element.Tag object
    # par = soup.a.parent
    # print(type(par))
    #
    # # Get all ancestor nodes; returns a generator object
    # pars = soup.a.parents
    # print(list(enumerate(pars)))
    #
    # # Get sibling nodes
    # sib = soup.div.next_siblings  # all following siblings
    # sibs = soup.div.previous_siblings  # all preceding siblings
    # print(sib)
    # print(sibs)

    # find_all selector: returns a collection of matching elements
    # uls = soup.div.div.find_all('ul')
    # print(type(uls))
    # print(uls[0])

    # Find elements by attribute
    # atts = soup.div.div.find_all(attrs={'class': 'more-items'})
    # ids = soup.div.div.find_all(id='123456')
    # classes = soup.div.div.find_all(class_='more-items')
    # print(atts)

    # The text argument ('登录' is the page's "log in" link text)
    # texts = soup.div.div.find_all(text='登录')
    # print(texts)

    # find: returns a single element
    # one = soup.div.div.find('ul')
    # print(one)

    # Return all ancestor nodes / the parent node
    # find_parents()
    # find_parent()
    # Return the first following sibling / all following siblings
    # find_next_sibling()
    # find_next_siblings()
    # Return the first preceding sibling / all preceding siblings
    # find_previous_sibling()
    # find_previous_siblings()
    # Return the first / all matching nodes after this node
    # find_next()
    # find_all_next()
    # Return the first / all matching nodes before this node
    # find_previous()
    # find_all_previous()

    # CSS selectors
    # soup.select('ul li')
    # soup.select('.class1 .class2')
    # soup.select('#id1 .class1')

    # Get the text inside tags
    # p = soup.select('p')
    # for p1 in p:
    #     print(p1.get_text().replace(' ', '').strip())