# -*- coding: utf-8 -*-
import urllib.request
import http.cookiejar
url = "http://www.baidu.com"

print('第一种方法')
# Method 1: plain urlopen with no customization.
# Use a context manager so the HTTP response (and its socket) is closed
# deterministically instead of being leaked.
with urllib.request.urlopen(url) as response1:
    print(response1.getcode())
    print(response1.read())

print('第二种方法')
# Method 2: build a Request object so headers can be attached
# (some servers reject the default Python-urllib User-Agent).
request = urllib.request.Request(url)
request.add_header("user-agent", 'Mozilla/5.0')
with urllib.request.urlopen(request) as response2:
    print(response2.getcode())
    print(response2.read())
print('第三种方法')
# Method 3: carry cookies across requests.
# A CookieJar stores cookies received from the server; HTTPCookieProcessor
# wires it into an opener so subsequent requests through that opener
# automatically send the stored cookies back.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
# Fetch the home page; any Set-Cookie headers are captured into `cookie`.
# The context manager closes the response instead of leaking it.
with opener.open('http://www.baidu.com') as result:
    print(result.read())
print(cookie)
print('-------------------------------------')
from bs4 import BeautifulSoup
import re
# Sample document used to demonstrate the BeautifulSoup queries below.
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse with the stdlib html.parser backend.  NOTE: the original passed
# from_encoding='utf-8', but that argument only applies to *bytes* markup;
# for a str it is ignored and BeautifulSoup emits a UserWarning, so it is
# dropped here.
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
print('&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
# Print the href attribute and the text of every <a> tag.
for link in links:
    print(link['href'])
    print(link.get_text())
# Find the first <a> whose href matches the regex (Tillie's link).
node = soup.find('a', href=re.compile(r'ill'))
print(node['href'])
print(node.get_text())
# Find the <p> with CSS class "title"; `class_` avoids clashing with the
# Python `class` keyword.
p_node = soup.find('p', class_="title")
print(p_node.get_text())
# --- Scraped article metadata (not executable code; kept as comments) ---
# python爬虫技术-beautifulsoup的应用
# 最新推荐文章于 2023-04-25 08:15:00 发布
# 本文介绍了使用Python进行网页爬取的三种方法，包括直接请求、添加User-Agent以及利用CookieJar处理Cookie。此外，还展示了如何使用BeautifulSoup解析HTML文档，提取链接及文本内容，并通过正则表达式定位特定元素。
# 220
# 被折叠的 条评论
# 为什么被折叠?