本文涉及以下内容:
1、urllib 中的 request
2、requests
3、lxml 中的 etree
4、bs4 中的 BeautifulSoup
一、定制请求头
1、urllib 中的 request
# -*- coding=utf-8 -*-
"""Demonstrate setting a custom User-Agent on a urllib Request."""
from urllib import request

url = 'http://example.webscraping.com'

# Default user-agent:
# req = request.Request(url)

# Custom user-agent.
# BUG FIX: the HTTP header name is 'User-Agent' (hyphen, not underscore).
# The original 'User_agent' would be sent verbatim as a non-standard
# header that servers do not recognise as the user agent.
headers = {'User-Agent': 'hahahah'}
req = request.Request(url, headers=headers)

# Request stores header keys via str.capitalize(), so the lookup key
# here is 'User-agent' (only the first letter upper-cased).
user_agent = req.get_header('User-agent')
print(req.headers)
2、requests
# -*- coding=utf-8 -*-
import requests


def main():
    """Fetch a page while sending a customised User-Agent header."""
    url = 'http://www.baidu.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
    response = requests.get(url, headers=headers)
    # Show the User-Agent that was actually sent with the request.
    print(response.request.headers['user-agent'])
    # print(response.text)
    # print(response.encoding)


if __name__ == '__main__':
    main()
二、解析网页
3、lxml 中的 etree
#-*- coding:utf-8 -*-
"""lxml / XPath demo: query a local bookstore XML document."""
from lxml import etree  # lxml, XPath
# from xml.dom import minidom  # dom, minidom

# ---------------- lxml / XPath ----------------
html = etree.parse(r"C:\Users\JADE\Videos\CRAWLER\16. Crawler\craw_day02\bookStore.xml")

# Select the <price> element of every book costing more than 29.00.
prices = html.xpath('/bookstore/book[price>29.00]/price')
print(len(prices))
print(type(prices))
# print(prices)
print(prices[0].text)
print(prices[0].getparent().tag)       # tag of the parent element
print(prices[0].getprevious().text)    # text of the preceding sibling element
print(prices[0].xpath('string(.)'))
print(prices[1].xpath('string(.)'))

# Select <title> elements whose lang attribute is "eng".
eng_titles = html.xpath('/bookstore/book/title[@lang="eng"]')
print(eng_titles[0].text)
4、bs4 中的 BeautifulSoup
# -*- coding=utf-8 -*-
"""BeautifulSoup demo: navigate a small (deliberately sloppy) HTML document."""
from bs4 import BeautifulSoup as BS
import re

doc = ['<html><head><title>Page title</title></head>',
       '<body><p id="firstpara" align="center">This is paragraph <b>one</b>.',
       '<p id="secondpara" align="blah">This is paragraph2 <b>two</b>.',
       '</html>']
soup = BS(''.join(doc), 'html.parser')

print(soup.prettify())        # pretty-printed document
print(soup.title)             # <title>Page title</title>
print(soup.title.string)      # Page title
print(soup.title.parent.name) # head
print(soup.head)              # <head><title>Page title</title></head>
print(soup.head.string)       # Page title
print(soup.p)                 # first <p> in the document
print(soup.p['align'])
print(soup.p['id'])
print(soup.p.text)
print(soup.p.find(id='secondpara'))
print(soup.find(id='secondpara').b)
# To reach the second <p> without relying on its id, you can also use
# soup.find_all('p')[1] or soup.p.find_next('p').