#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time

import requests
from bs4 import BeautifulSoup

# Pretend to be a desktop browser so the site does not reject the requests.
headers = {
    "User-Agent": 'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10'
}
class zfBase(object):
    '''Basic setup: fetch a page and return it as parsed HTML'''
    def __init__(self, url=None):
        # Only store the URL here; every caller invokes run() itself,
        # so fetching in __init__ as well would request each page twice.
        self.url = url

    def __url(self):
        if self.url:
            html = requests.get(self.url, headers=headers)
            soup = BeautifulSoup(html.text, 'lxml')
            return soup
        else:
            return 'Invalid URL.'

    def run(self):
        return self.__url()
def getTagUrl(html):
    '''Work out how many listing pages need to be scraped'''
    # The second-to-last link in the pagination bar holds the last page number.
    pageNums = html.select('#page_list > div.pagination_v2.pb0_vou > a')[-2].string
    return int(pageNums)
def zfSpider(listUrl):
    '''Scrape the content of each detail page'''
    if isinstance(listUrl, list):
        for url in listUrl:
            html = zfBase(url).run()
            time.sleep(2)  # be polite: pause between requests
            titles = html.select('div.pho_info > h4 > em')[0].string        # listing title
            address = html.select('div.pho_info > p')[0]['title']           # full address
            money = html.select('div.day_l > span')[0].string               # price per night
            onepic = html.select('img#curBigImage')[0]['src']               # main room photo
            peoplepic = html.select('div.member_pic > a > img')[0]['src']   # host avatar
            sex = html.select('div.member_pic > div')[0]['class']           # gender icon class list
            if 'member_ico' in sex:
                sex = '男'
            else:
                sex = '女'
            with open('xiaozhu.txt', 'a', encoding='utf-8') as fp:
                fp.write(titles + '\t' + address + '\t' + money + '\t'
                         + sex + '\t' + onepic + '\t' + peoplepic + '\n')
    else:
        print('URL error')
def run(url):
    '''Overall run: walk every listing page and scrape its detail pages'''
    # Fetch the first listing page once, just to read the total page count.
    z = zfBase(url % 1)
    for num in range(1, getTagUrl(z.run()) + 1):
        z = zfBase(url % num).run()
        # Every detail-page link on a listing page carries the resule_img_a class.
        pageUrls = [pageUrl['href'] for pageUrl in z.select('a.resule_img_a')]
        time.sleep(2)
        zfSpider(pageUrls)

if __name__ == '__main__':
    run('http://bj.xiaozhu.com/search-duanzufang-p%s-0/?startDate=2018-01-01&endDate=2018-01-02')
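
The detail-page fields above all index [0] on the result of select(), so any listing whose layout differs (missing photo, delisted room, changed class names) raises an IndexError and stops the whole run. A minimal sketch of a safer lookup, assuming the same BeautifulSoup objects; the helper name pick and its default value are hypothetical, not part of the original script:

def pick(soup, selector, attr=None, default=''):
    '''Return the text (or a chosen attribute) of the first match, or a default if nothing matches.'''
    tag = soup.select_one(selector)  # hypothetical helper; select_one returns None on no match
    if tag is None:
        return default
    return tag.get(attr, default) if attr else (tag.string or default)

# Example: the same fields as zfSpider, but a missing element no longer crashes the loop.
# titles = pick(html, 'div.pho_info > h4 > em')
# address = pick(html, 'div.pho_info > p', attr='title')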