Task Description
Scrape information for 300 short-term rental listings; the specific fields to extract from each listing page are shown below.

Python Code
from bs4 import BeautifulSoup
import requests

# List-page URLs: pages 1-10 of the short-term rental search results
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 11)]

def get_lorder_sex(class_name):
    # The host's gender is encoded in the CSS class of the avatar badge
    if class_name == ['member_ico']:
        return '男'    # male
    elif class_name == ['member_ico1']:
        return '女'    # female

def get_attar(url):
    # Fetch one listing's detail page and extract its fields
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    titles = soup.select('div.pho_info > h4 > em')
    locations = soup.select('div.pho_info > p > span')
    prices = soup.select('div.day_l > span')
    images = soup.select('div.pho_show_big > div > img')
    lorder_names = soup.select('div.w_240 > h6 > a')
    lorder_images = soup.select('div.member_pic > a > img')
    lorder_genders = soup.select('div.member_pic > div')
    for title, location, price, image, lorder_name, lorder_image, gender in zip(
            titles, locations, prices, images, lorder_names, lorder_images, lorder_genders):
        data = {
            'title': title.get_text(),
            'location': location.get_text(),
            'price': price.get_text(),
            'image': image.get('src'),
            'lorder_name': lorder_name.get_text(),
            'lorder_image': lorder_image.get('src'),
            'gender': get_lorder_sex(gender.get('class'))
        }
        print(data)

for url in urls:
    # Collect the detail-page links from each list page, then scrape every listing
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    url_links = soup.select('a.resule_img_a')
    for url_link in url_links:
        get_attar(url_link.get('href'))
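To actually stop at the 300 listings the task asks for, the records need to be accumulated rather than only printed. Below is a minimal sketch of an alternative final loop, reusing the urls list and selectors defined above; it assumes get_attar is modified to return the list of dicts it builds instead of printing them, and MAX_LISTINGS, the results list, and the one-second delay are illustrative additions, not part of the original code.

import time

MAX_LISTINGS = 300  # target listing count from the task description (illustrative)
results = []        # illustrative container for the scraped records

for url in urls:
    web_data = requests.get(url)
    soup = BeautifulSoup(web_data.text, 'lxml')
    for url_link in soup.select('a.resule_img_a'):
        # Assumes get_attar() is changed to return its list of dicts rather than print them
        results.extend(get_attar(url_link.get('href')))
        time.sleep(1)  # small pause between detail-page requests
        if len(results) >= MAX_LISTINGS:
            break
    if len(results) >= MAX_LISTINGS:
        break

print('collected', len(results), 'listings')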
Results
Only two of the scraped listings are shown here.

Shortcomings
In PyCharm's console, the Chinese text is shown only as its escaped character codes rather than as readable Chinese characters.
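This typically happens because printing a dict whole shows the repr of its values, which escapes non-ASCII strings (particularly under Python 2). One possible workaround is sketched below; it assumes data is one of the record dicts built inside get_attar above, and is only an illustrative suggestion, not part of the original code.

import json

# Serializing the record with ensure_ascii=False keeps the Chinese characters readable
print(json.dumps(data, ensure_ascii=False))

# Alternatively, print each field individually so the string itself (not its repr) is shown
for key, value in data.items():
    print('{}: {}'.format(key, value))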