Case Studies
Scraping Doutula
XPath rule of thumb: when two tags are not directly nested, use //; when one is a direct child of the other, use /. Also, @ selects an attribute, such as src or data-original.
When saving data, when should you use response.text and when response.content?
Use response.text for text data; for images or other binary files, use response.content.
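A minimal sketch of the rule (the URLs here are placeholders, not from the case below):

import requests

resp = requests.get('https://example.com')
with open('page.html', 'w', encoding='utf-8') as f:
    f.write(resp.text)       # decoded str: HTML, JSON, plain text

img = requests.get('https://example.com/logo.png')
with open('logo.png', 'wb') as f:
    f.write(img.content)     # raw bytes: images, files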
# The title
"""
//div[@class = "col-sm-9 center-wrap"]//a/div[@class="random_title"]/text()
"""
# All the <a> tags
"""
//div[@class = "col-sm-9 center-wrap"]//a
"""
# The images
"""
//div[@class = "col-sm-9 center-wrap"]//a//div[@class="random_article"]//img/@src
"""
import requests
from lxml import etree  # needed for parsing
import os


class DTSpider():
    def __init__(self):
        self.url = 'https://www.doutula.com/article/list/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }

    # Send the request
    def send_request(self, url):
        response = requests.get(url=url, headers=self.headers)
        return response

    def parse_content(self, response):
        html = response.text
        content = etree.HTML(html)  # parse the HTML
        a_list = content.xpath('//div[@class = "col-sm-9 center-wrap"]//a')
        print(a_list)  # a list of elements
        for a in a_list:
            title_list = a.xpath('./div[@class="random_title"]/text()')
            # . continues selecting relative to the element from the previous XPath
            img_list = a.xpath('.//div[@class="random_article"]//img/@data-original')
            # Save: create a folder named after the title
            if title_list:
                if not os.path.exists('doutu/' + title_list[0]):
                    # title_list is a list holding a single value
                    os.mkdir('doutu/' + title_list[0])
                # Check first and create the folder only if it is missing.
                # Note: the doutu folder itself must be created by hand, or mkdir fails.
                # Saving like the block below works, but the images come out unordered.
                # Some meme sets are sequential, so download them in order instead.
                # for pic in img_list:
                #     response = self.send_request(pic)  # request the image
                #     name = pic[-13:]
                #     self.save_content(response, name, 'doutu/' + title_list[0])
                # enumerate assigns each image an index
                for index, pic in enumerate(img_list):
                    print(index, pic)
                    response = self.send_request(pic)  # request the image
                    name = str(index + 1) + '_' + pic[-13:]
                    # the first image would be index 0, which looks odd, so add 1
                    print(response)
                    self.save_content(response, name, 'doutu/' + title_list[0])

    def save_content(self, response, name, path):
        with open(path + '/' + name, 'wb') as f:
            f.write(response.content)

    def start(self):
        for i in range(1, 2):
            full_url = self.url + str(i)
            response = self.send_request(full_url)
            self.parse_content(response)


if __name__ == '__main__':
    dt = DTSpider()
    dt.start()
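A hedged aside: os.makedirs can replace both the manual creation of the doutu folder and the exists check, because it builds the intermediate directories itself (title_list here is the variable from parse_content above):

import os

folder = 'doutu/' + title_list[0]
os.makedirs(folder, exist_ok=True)  # also creates doutu/ if it is missing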
Scraping Lianjia
The data scraped in this case is saved to a database, so create the database and table first:
create database db_lianjia charset utf8;
create table if not exists lianjia(
agentInfoList varchar(255),
title varchar(255),
houseInfo varchar(255),
dealDate varchar(255),
totalPrice varchar(255),
dealCycleTxt varchar(255),
positionInfo varchar(255),
unitPrice varchar(255)
)charset utf8;
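If you prefer to run the DDL from Python rather than the mysql shell, a minimal sketch with pymysql (same credentials as the spider below):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123456')
with conn.cursor() as cur:
    cur.execute("create database if not exists db_lianjia charset utf8")
    cur.execute("use db_lianjia")
    # ...then execute the create table statement above...
conn.commit()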
import requests
from lxml import etree
import pymysql
import json


class LJSpider():
    def __init__(self):
        self.url = 'https://bj.lianjia.com/chengjiao/pg%d'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        }
        # Connect to the database
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='db_lianjia',
                                          )
        self.cursor = self.connection.cursor()
        self.pn = 1

    def send_request(self, url):
        response = requests.get(url=url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = response.text
        content = etree.HTML(html)  # parse the HTML
        with open('lianjia.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        li_list = content.xpath('//ul[@class="listContent"]/li')  # list of <li> elements
        for li in li_list:
            img_list = li.xpath('./a/img/@src')
            if img_list:
                img = img_list[0]
            else:
                img = ""
            title = "".join(li.xpath('.//div[@class="title"]/a/text()'))
            houseInfo = li.xpath('.//div[@class="houseInfo"]/text()')[0]
            dealDate = li.xpath('.//div[@class="dealDate"]/text()')[0]
            totalPrice = "".join(li.xpath('.//div[@class="totalPrice"]//text()'))
            positionInfo = li.xpath('.//div[@class="positionInfo"]/text()')[0]
            unitPrice = li.xpath('.//div[@class="unitPrice"]/text()')[0]
            dealHouseTxt = "".join(li.xpath('.//div[@class="unitPrice"]//text()'))
            # join here as well: xpath returns a list, and pymysql cannot insert a list
            dealCycleTxt = "".join(li.xpath('.//span[@class = "dealCycleTxt"]/text()'))
            agentInfoList = li.xpath('.//div[@class="agentInfoList"]/a/text()')[0]
            item = {}
            item['agentInfoList'] = agentInfoList
            item['title'] = title
            item['houseInfo'] = houseInfo
            item['dealDate'] = dealDate
            item['totalPrice'] = totalPrice
            item['dealCycleTxt'] = dealCycleTxt
            item['positionInfo'] = positionInfo
            item['unitPrice'] = dealHouseTxt
            # item['dealHouseTxt'] = dealHouseTxt
            self.save_content(item)
        # next = content.xpath('//div[@class="page-box fr"]//a[last()]/@href')
        next_text = content.xpath('//div[@class="page-box fr"]//div/@page-data')[0]
        print(next_text)
        totalPage = json.loads(str(next_text))['totalPage']
        if self.pn < totalPage:
            self.pn += 1
            full_url = self.url % (self.pn)
            response = self.send_request(full_url)
            if response:
                self.parse_content(response)
        # curPage = json.loads(str(next_text))['curPage']
        # if next_text == '下一页':
        #     response = self.send_request(next)
        #     if response:
        #         self.parse_content(response)

    def save_content(self, item):
        # insert into the lianjia table (db_lianjia is the database, not the table)
        sql = "insert into `lianjia` (`agentInfoList`,`title`,`houseInfo`,`dealDate`,`totalPrice`,`dealCycleTxt`,`positionInfo`,`unitPrice`) values (%s,%s,%s,%s,%s,%s,%s,%s)"
        self.cursor.execute(sql, [v for v in item.values()])
        self.connection.commit()

    def start(self):
        full_url = self.url % (self.pn)
        response = self.send_request(full_url)
        if response:
            self.parse_content(response)


if __name__ == '__main__':
    lj = LJSpider()
    lj.start()
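The tail call in parse_content pages through the site by recursing, adding one stack frame per page. A sketch of an iterative loop instead, assuming parse_content is changed to return totalPage rather than recursing:

def start(self):
    while True:
        full_url = self.url % (self.pn)
        response = self.send_request(full_url)
        if not response:
            break
        totalPage = self.parse_content(response)  # assumed to return totalPage
        if self.pn >= totalPage:
            break
        self.pn += 1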
Extracting text with XPath
A small tip for extracting text with XPath:
totalPrice = "".join(li.xpath('.//div[@class="totalPrice"]//text()'))
# this yields the complete value
# whereas extracting like this:
totalPrice = li.xpath('.//div[@class="totalPrice"]//text()')[0]
# only yields the number, without the unit that follows it
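A minimal demo of the difference, assuming the price markup is a <span> with trailing text, as on the Lianjia listing:

from lxml import etree

li = etree.HTML('<div class="totalPrice"><span>580</span>万</div>')
print(li.xpath('//div[@class="totalPrice"]//text()')[0])        # 580
print("".join(li.xpath('//div[@class="totalPrice"]//text()')))  # 580万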
Note: to pull values out of a JSON string such as the page-data attribute above, use json.loads(), which parses the JSON string into a Python object (e.g. a dict); json.dumps() is the reverse, serializing a Python object into a JSON string.
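For example, with a page-data string shaped like the Lianjia pager's (the values here are made up):

import json

next_text = '{"totalPage": 100, "curPage": 1}'
page_info = json.loads(next_text)                 # JSON string -> dict
print(page_info['totalPage'])                     # 100
print(json.dumps(page_info, ensure_ascii=False))  # dict -> JSON string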
Saving Lianjia data to CSV (an alternative version)
import requests
from lxml import etree
import urllib.parse
import json


class LianJiaSpider():
    def __init__(self):
        self.base_url = 'https://bj.lianjia.com/ershoufang/pg{}/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.page = 0

    def send_request(self, full_url):
        print(full_url)
        response = requests.get(url=full_url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = etree.HTML(response.text)
        with open('lianjia.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        page_data = html.xpath('//div[@class="page-box house-lst-page-box"]/@page-data')[0]
        li_list = html.xpath('//ul[@class="sellListContent"]/li')
        info = []
        for li in li_list:
            ctx = {}
            title = li.xpath('.//div[@class="title"]/a/text()')
            if title:
                title = title[0]
                print(title)
                ctx['title'] = title
            price = li.xpath('.//div[@class="totalPrice"]/span/text()')
            if price:
                price = price[0]
                print(price)
                ctx['price'] = price
            info.append(ctx)
        self.save_content(info)
        if self.page == 1:
            self.totalPage = json.loads(page_data).get('totalPage')
        # if page_data == '下一页':
        #     href = html.xpath('//div[@class="page-box house-lst-page-box"]/a[last()]/@href')
        #     next_url = urllib.parse.urljoin(self.base_url, href)
        #     print(next_url)
        #     # self.send_request()

    def save_content(self, info):
        for i in info:
            if not (i.get('title') and i.get('price')):
                continue  # skip <li> entries (e.g. ads) without title or price
            with open('info.csv', 'a', encoding='utf-8') as f:
                f.write(i.get('title') + '\t' + i.get('price') + '万' + '\n')

    def start(self):
        while True:
            self.page += 1
            full_url = self.base_url.format(self.page)
            response = self.send_request(full_url)
            if not response:
                break
            self.parse_content(response)
            if self.page == self.totalPage:
                break
        # for i in range(100, 101):
        #     full_url = self.base_url.format(i)
        #     print(full_url)
        #     response = self.send_request(full_url)
        #     self.parse_content(response)


if __name__ == '__main__':
    ljs = LianJiaSpider()
    ljs.start()
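A hedged alternative for save_content using the standard csv module, which handles quoting and odd characters in titles automatically (same info structure as above):

import csv

def save_content(self, info):
    with open('info.csv', 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter='\t')
        for i in info:
            if i.get('title') and i.get('price'):
                writer.writerow([i.get('title'), i.get('price') + '万'])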
Scraping a vegetable price site (filtering by a given date range): this one uses a POST request, while the cases above all use GET.
import requests
import json
import time


class PriceSpider():
    def __init__(self):
        self.url = 'http://www.cncyms.cn/pages.php'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.form_data = {
            'pageNum': 0,
            'pname': '',
            'reltime': '蔬菜',  # form value the site expects ("vegetables")
        }

    def get_time(self, date):
        # Convert "YYYY-MM-DD" to a struct_time, then to a Unix timestamp
        timeArray = time.strptime(date, "%Y-%m-%d")
        timeStamp = int(time.mktime(timeArray))
        return timeStamp  # e.g. 1381419600

    def send_request(self):
        response = requests.post(self.url, data=self.form_data, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        json_content = response.json()  # parsed into a dict
        # print(json_content)
        data_list = []
        for data in json_content.get('list'):
            print(data)
            releasetime = self.get_time(data.get('ReleaseTime'))
            if self.start_time != '1' and self.end_time != '1':
                # keep only records inside the requested date range
                if self.start_time <= releasetime <= self.end_time:
                    print(data)
                    data_list.append(data)
            else:
                data_list.append(data)  # no date filter: keep everything
        content = json.dumps(data_list, ensure_ascii=False)  # keep Chinese readable instead of \uXXXX escapes
        self.save_content(content)
        # content = json.dumps(json_content, ensure_ascii=False)  # or dump the whole page
        # self.save_content(content)

    def save_content(self, content):
        with open('price.txt', 'a', encoding='utf8') as f:
            f.write(content + '\n')

    def start(self):
        try:
            num = int(input('How many pages to scrape? '))
            name = input('Product name (enter 1 for all): ')
            self.start_time = input('Start date, e.g. 2020-06-02 (enter 1 for all): ')
            self.end_time = input('End date, e.g. 2020-06-05 (enter 1 for all): ')
            if self.start_time != '1' and self.end_time != '1':
                self.start_time = self.get_time(self.start_time)
                self.end_time = self.get_time(self.end_time)
                if self.start_time > self.end_time:
                    print('The end date must be later than the start date')
                    return
            if not name == '1':
                self.form_data['pname'] = name
            for i in range(0, num):
                self.form_data['pageNum'] = i
                response = self.send_request()
                self.parse_content(response)
        except Exception as e:
            print('Invalid input:', e)


if __name__ == '__main__':
    ps = PriceSpider()
    ps.start()
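A quick sanity check of the conversion in get_time() (the printed value depends on your local timezone, since mktime interprets the struct_time as local time):

import time

ts = int(time.mktime(time.strptime('2020-06-02', '%Y-%m-%d')))
print(ts)  # Unix timestamp for 2020-06-02 00:00 local time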
Scraping Xiachufang
import requests
from lxml import etree
import urllib.parse


class XiaChuFangSpider():
    def __init__(self):
        self.url = 'https://www.xiachufang.com/category/40076/?page='
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }

    def send_request(self, full_url):
        response = requests.get(full_url, headers=self.headers)
        if response.status_code == 200:
            return response

    def parse_content(self, response):
        html = etree.HTML(response.text)
        href_list = html.xpath('//p[@class="name"]/a/@href')
        for href in href_list:
            # the hrefs are relative, so join them with the base URL
            detail_url = urllib.parse.urljoin(self.url, href)
            print(detail_url)
            response = self.send_request(detail_url)
            self.parse_detail(response)

    def parse_detail(self, response):
        html = etree.HTML(response.text)
        with open('xiachufang.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
        # strip surrounding whitespace so the title is usable as a filename
        title = html.xpath('//h1/text()')[0].strip()
        tr_list = html.xpath('//div[@class="ings"]//tr')
        print(tr_list)
        zuoliao = ''  # the ingredient list
        for tr in tr_list:
            zuoliao += "\t".join(tr.xpath('./td//text()')).strip().replace('\n', '').replace(' ', '')
        print(zuoliao)
        step = "".join(html.xpath('//li[@class="container"]//p//text()'))
        self.save_content(title, zuoliao, step)

    def save_content(self, title, zuoliao, step):
        # note: the xiachufang folder must be created by hand first
        with open("xiachufang/" + title + '.txt', 'w', encoding='utf-8') as f:
            f.write(zuoliao + "\n\n")
            f.write(step + "\n\n")

    def start(self):
        for i in range(1, 2):
            full_url = self.url + str(i)
            print(full_url)
            response = self.send_request(full_url)
            self.parse_content(response)


if __name__ == '__main__':
    xcfs = XiaChuFangSpider()
    xcfs.start()
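How urljoin resolves the relative recipe links here (/recipe/12345/ is a made-up path for illustration):

import urllib.parse

base = 'https://www.xiachufang.com/category/40076/?page='
print(urllib.parse.urljoin(base, '/recipe/12345/'))
# -> https://www.xiachufang.com/recipe/12345/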