I wrote a small script to collect the housing listings posted on a local community site (the Qingyang County forum), wondering whether the listing data might hold anything worthwhile. Analysis to follow later.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Given a number of pages, scrape the for-sale listings from each page.
Version 2: refactored into functions.
Points of attention: naming conventions, splitting the work into functions,
exception handling.
'''
import csv
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, ReadTimeout
'''
Configuration
'''
pages = 5
url = 'http://www.ahqy.cn/forum.php?mod=forumdisplay&fid=527&sortid=106&sortid='\
      '106&filter=sortid&page={}'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:62.0) '
                         'Gecko/20100101 Firefox/62.0'}
csv_file_name = 'houses_info.csv'
# Column labels: address, type of house, floor area, price, renovation state,
# facilities, floor, orientation
csv_info_header = ['地址', '房屋类型', '面积', '价格', '装修情况', '屋内设施',
                   '楼层', '坐向']
# Write the header row to the CSV file
with open(csv_file_name, 'w', encoding='utf-8', newline='') as csv_obj:
    csv_write = csv.writer(csv_obj)
    csv_write.writerow(csv_info_header)


def get_text_from_url(url):
    # Fetch the page text for the given URL
    try:
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.text
        else:
            print('get page failed.', response.status_code)
            return None
    except (ConnectionError, ReadTimeout) as e:
        print('crawling failed', url, e)
        return None


def main():
    # Loop over every listing page
    for num in range(1, pages + 1):
        # Find all property-listing links on the current page
        text = get_text_from_url(url.format(num))
        if text:
            soup = BeautifulSoup(text, 'html.parser')
        else:
            continue  # page could not be fetched, move on to the next one
        hrefs = soup.find_all('a', {'onclick': 'atarget(this)', 'class': 's xst'})
        if hrefs:
            hrefs = [href.get('href') for href in hrefs]
        else:
            continue  # no listing links on this page, move on to the next one
        # Extract the details from each listing
        for href in hrefs:
            text = get_text_from_url(href)
            if text:
                soup = BeautifulSoup(text, 'html.parser')
            else:
                continue
            info_table = soup.find_all("table", {"summary": "分类信息"})
            if info_table:
                info_table = info_table[0]  # keep the first match, i.e. the details table
            else:
                print('details table not found:', href)
                continue
            # Basic data cleaning
            td_infos = info_table.find_all("td")
            row_write_in_csv = []
            # Drop the last cell (the phone number); blank out empty cells and '-'
            for td_info in td_infos[:-1]:
                td = td_info.string or ''  # .string is None for cells with nested tags
                td = td.strip()            # also removes non-breaking spaces
                if td == '-':
                    td = ''
                row_write_in_csv.append(td)
            with open(csv_file_name, 'a', encoding='utf-8', newline='') as csv_obj:
                csv_write = csv.writer(csv_obj)
                csv_write.writerow(row_write_in_csv)


if __name__ == '__main__':
    main()
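
As a first pass at the "analysis to follow", the CSV produced above can be loaded with pandas. This is only a minimal sketch under a few assumptions: pandas is installed, houses_info.csv sits in the working directory, the column names are the Chinese labels written by the script, and 价格_数值 below is just an illustrative name for a derived numeric-price column, not something the script creates.

# Minimal sketch of a first look at the scraped data (assumes pandas is
# installed and houses_info.csv was produced by the script above).
import pandas as pd

df = pd.read_csv('houses_info.csv', encoding='utf-8')
print(df.shape)           # number of listings and number of columns
print(df['价格'].head())  # raw price column, still free-form forum text

# The price field is free text on the forum, so pull the first number out of
# each cell before computing any summary statistics. '价格_数值' is an
# illustrative column name added here, not part of the original script.
df['价格_数值'] = pd.to_numeric(
    df['价格'].astype(str).str.extract(r'(\d+\.?\d*)')[0], errors='coerce')
print(df['价格_数值'].describe())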