Scraping Xiaozhu Rentals with requests and Storing in MongoDB -- Notes

This post records a short-term rental scraper written in Python. The crawler uses the requests library to fetch pages, BeautifulSoup to parse the HTML, and pymongo to store the scraped data in MongoDB. It collects listing links from the site's search pages, then visits each detail page to extract the listing's title, address, and nightly price.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time

import pymongo
import requests
from bs4 import BeautifulSoup

headers = {
    # Desktop browser UA so the site serves the normal page
    "User-Agent": 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10'
}
class baseItem(object):
    '''Base helpers: fetch a page and write records to MongoDB.'''
    def __init__(self, url):
        self.url = url

    def getHtml(self):
        # headers is read from module scope; no global declaration needed
        resp = requests.get(self.url, headers=headers)
        soup = BeautifulSoup(resp.text, 'lxml')
        return soup

    def pymongoData(self, data):
        # Insert one scraped record into zfdata.sheet_tab
        client = pymongo.MongoClient('localhost', 27017)
        zfdata = client['zfdata']
        sheet_tab = zfdata['sheet_tab']
        sheet_tab.insert_one(data)
def getUrl(listhtml):
    '''Collect the detail-page links from a parsed search-results page.'''
    urls = listhtml.select('a.resule_img_a')
    return [url['href'] for url in urls]
def zfSpider(zfdata):
    '''Scrape title, address, and price from one detail page.'''
    g = baseItem(zfdata).getHtml()
    title = g.select('div.pho_info > h4')[0].text.strip()
    address = g.select('span.pr5')[0].text.strip()
    money = g.select('div.day_l > span')[0].text.strip()
    data = {
        'title': title,
        'address': address,
        'money': int(money)  # assumes the price text is plain digits
    }
    return data
def findData():
    '''Print every stored listing with a nightly price of at least 500.'''
    client = pymongo.MongoClient('localhost', 27017)
    zfdata = client.get_database('zfdata')
    sheet_tab = zfdata.get_collection('sheet_tab')
    for item in sheet_tab.find({'money': {'$gte': 500}}):
        print(item)
def run(url):
    b = baseItem(url)
    for g in getUrl(b.getHtml()):
        b.pymongoData(zfSpider(g))
if __name__ == '__main__':
    # Crawl search-result pages 1-3, pausing between pages
    for i in range(1, 4):
        run('http://bj.xiaozhu.com/search-duanzufang-p%s-0/?startDate=2018-01-03&endDate=2018-01-04' % i)
        time.sleep(3)
    findData()
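One weak spot worth noting: the script assumes every request succeeds and every selector matches, so a slow server can hang requests.get indefinitely and a changed page layout makes the [0] indexing in zfSpider raise IndexError. Below is a minimal hardened sketch of the fetch step; the helper name get_html and the retry/timeout values are my own choices, not part of the original script.

import time

import requests
from bs4 import BeautifulSoup

headers = {"User-Agent": "Mozilla/5.0"}  # substitute the script's real headers

def get_html(url, retries=3, timeout=10):
    '''Fetch and parse one page, retrying on transient network errors (hypothetical helper).'''
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
            return BeautifulSoup(resp.text, 'lxml')
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # simple backoff: 1s, 2s, ...

Since it returns the same BeautifulSoup object as baseItem.getHtml, it could stand in for that method without touching the rest of the pipeline.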
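A second caveat for re-runs: insert_one never checks for existing documents, so crawling the same pages twice stores every listing twice. One way to make the writes idempotent is to upsert keyed on the detail-page URL; an index on money also speeds up the $gte query in findData. A sketch under those assumptions -- save_listing is a hypothetical helper, and the url field is not stored by the original code:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
sheet_tab = client['zfdata']['sheet_tab']

# One-time setup: index the price field used by findData's range query
sheet_tab.create_index([('money', pymongo.ASCENDING)])

def save_listing(url, data):
    '''Insert or refresh one listing, keyed on its source URL (hypothetical helper).'''
    data['url'] = url
    sheet_tab.update_one({'url': url}, {'$set': data}, upsert=True)

In run, calling save_listing(g, zfSpider(g)) in place of b.pymongoData(...) would then update records in place on repeated crawls instead of duplicating them.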