Python 爬58同城城市租房信息

最新推荐文章于 2021-05-06 13:55:44 发布

weixin_30940783

最新推荐文章于 2021-05-06 13:55:44 发布

阅读量199

点赞数

CC 4.0 BY-SA版权

文章标签： python

原文链接：http://www.cnblogs.com/cutesnow/p/7161692.html

本文介绍了一种使用Python进行网页爬取的方法，具体目标为58同城网站上的租房信息。通过解析网页结构并利用lxml和cssselect库定位所需数据，该爬虫能够自动抓取房屋标题、价格、支付方式等详细信息，并将结果保存为CSV文件。

爬取完成后会自动生成 CSV 电子表格文件（文件名为“城市简称_houses.csv”），包含标题、价格、押付方式、详情和链接等信息。

环境

Python 2.7（代码使用了 urllib2、raw_input 和 print 语句，不能直接在 Python 3 下运行）

pip install lxml

pip install cssselect

 1 #coding:utf-8
 2 import csv
 3 import urllib2
 4 import lxml.html
 5 import time
 6 import sys
 7 from lxml.cssselect import CSSSelector
 8 import threading
 9 reload(sys)
10 sys.setdefaultencoding('utf8')
11 
12 print "请输入要爬取得城市简称例如bj（北京）："
13 CITY=str(raw_input(">>>"))
def download(url, user_agent='Google', num_retries=2):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        user_agent: Value sent in the ``User-agent`` request header.
        num_retries: How many further attempts are allowed after a response
            whose error code is in the 5xx range.

    Returns:
        Whatever ``read()`` returns for the response, or ``None`` when the
        request raised ``urllib2.URLError`` and was not (or could no longer
        be) retried.
    """
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        # Retry only errors that carry a 5xx status code; errors without a
        # ``code`` attribute or with any other code give up immediately.
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # Forward user_agent explicitly so the decremented counter lands
            # in num_retries. Passing it as the second positional argument
            # would overwrite user_agent and never count down, recursing
            # forever on a persistent 5xx response.
            return download(url, user_agent, num_retries - 1)
        return None
26 
27 
def get_data(url):
    """Scrape one listing page and append its fields to ``<CITY>_houses.csv``.

    Prints the listing title and a ``price|payment`` line. Then, for every
    detail cell (list items 1-7, spans 1-2 of the description list), appends
    four CSV rows: title (with the URL), price, payment terms and the text of
    that cell with spaces removed.

    A ``TypeError`` or ``IndexError`` raised anywhere along the way silently
    ends processing of this listing; an ``IndexError`` occurs as soon as a
    selector matches no element. Rows already written are kept.
    """
    page = download(url)
    try:
        tree = lxml.html.fromstring(page)

        def first_text(css):
            # Text of the first element matching ``css``; raises IndexError
            # when nothing matches.
            return CSSSelector(css)(tree)[0].text_content()

        title = first_text('div.main-wrap > div.house-title > h1')
        print(title)
        price = first_text('div.house-pay-way > span:nth-child(1)')
        pay_terms = first_text('div.house-pay-way > span:nth-child(2)')
        print('%s|%s' % (price, pay_terms))

        cell_css = 'div.house-desc-item > ul.f14 > li:nth-child(%s) > span:nth-child(%s)'
        for item_no in range(1, 8):
            for span_no in range(1, 3):
                detail = first_text(cell_css % (item_no, span_no)).replace(' ', '')
                rows = [
                    ('标题 ： ', title, '#', url),
                    ('价格： ', price, '#'),
                    ('压付： ', pay_terms, '#'),
                    ('详情： ', detail, '#'),
                ]
                # The file is reopened in append mode for every cell, so each
                # group of four rows is written and closed on its own.
                with open('%s_houses.csv' % CITY, 'ab+') as csvfile:
                    csv.writer(csvfile, lineterminator='\n').writerows(rows)
    except (TypeError, IndexError):
        pass
56 
def get_url(html):
    """Return the distinct listing links found in an index page.

    Parses ``html``, selects the title anchors of the listing entries and
    collects their ``href`` values in document order, keeping only the first
    occurrence of each value.
    """
    anchors = CSSSelector(
        'div.mainbox > div.main > div.content > div.listBox > ul.listUl > li > div.des > h2 > a'
    )(lxml.html.fromstring(html))

    seen = set()
    links = []
    for anchor in anchors:
        href = anchor.get('href')
        if href in seen:
            continue
        seen.add(href)
        links.append(href)
    return links
65 
66 
if __name__ == '__main__':
    # Entry point: fetch the city's rental index page, then scrape every
    # listing it links to, one thread per listing.
    url_index = 'http://%s.58.com/chuzu/' % CITY
    html_text_list = download(url_index)
    if html_text_list is None:
        # download() returns None when the request failed. Stop with a clear
        # message instead of letting the HTML parser raise on None.
        sys.exit('Failed to download listing index: %s' % url_index)
    url_list = get_url(html_text_list)

    for url_detail in url_list:
        thr = threading.Thread(target=get_data, args=(url_detail,))
        thr.start()

        # Brief pause between thread starts.
        time.sleep(0.001)