# 1. Import packages
import requests
from lxml import etree
import pymysql
import chardet
# 2. Fetch the HTML of a single page
def get_one_page(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body as text.

    The response encoding is set from chardet's detection over the raw
    response bytes before the body is decoded.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, passed through to
            ``requests.get``. Defaults to 10.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.67 Safari/537.36'
        ),
    }
    # requests applies no timeout by default; without one an unresponsive
    # server would block this call forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Override the encoding with the one detected from the actual bytes.
    response.encoding = chardet.detect(response.content)['encoding']
    return response.text
# 3. Parse the HTML
def parse_one_page(html):
result = etree.HTML(html)
item = {
}
item['t1'] = result.xpath('//div[@class="el"]/p/span/a/text()')
item['t2'] = result.xpath('//div[@class="el"]/span[@class="t2"]/a/text()')
item['t3'] = result.xpath('//div[@class="el"]/span[@class="t3"]/text()')
t4 = result.xpath('//div[@class="el"]/span[@class="t4"]')
item['t4'] = []
for i in t4:
item['t4'].append(i.xpath('string(.)'))
item['t5'] = result.xpath('//div[@class="el"]/span[@class="t5"]/text()')
it