I'm new to Python and web scraping, and since I happened to be apartment hunting in Beijing, I decided to try scraping Lianjia's rental listings myself. I had seen people online scrape Lianjia's second-hand home sales data and used that as a reference, but the rental side turned out to be simpler: no login simulation is needed and Lianjia doesn't do much anti-scraping, so the whole thing went fairly smoothly.
The overall approach: although Lianjia doesn't use heavy anti-scraping measures, it does throttle request density per IP, so proxies are a must; proxies inevitably slow the crawl down, so multithreading is needed to compensate. Implementation order: first the proxy scraper, then single-threaded single-page scraping, then a switch to multithreading, and finally the proxies wired into the whole thing.
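To make the proxy idea concrete before the full listing, here is a minimal sketch of routing a single request through an HTTP proxy with urllib2; the ip/port pair is a made-up placeholder and the Lianjia URL is only illustrative:

# sketch: send one request through an HTTP proxy (placeholder address)
import urllib2
proxy = {'ip': '1.2.3.4', 'port': '8080'}
handler = urllib2.ProxyHandler({'http': 'http://%s:%s' % (proxy['ip'], proxy['port'])})
opener = urllib2.build_opener(handler)
response = opener.open('http://bj.lianjia.com/zufang/', timeout=10)
print response.getcode()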
First, the proxy part. A quick search turned up a few free proxy sites that update continuously: 快代理 (Kuaidaili), 西刺 (Xici), and proxy360. The plan is to wrap a class around each of these sites to extract proxy IPs and ports. Being a beginner and wanting to drill the fundamentals, I went with the clumsiest tool available, regular expressions, to pull out the data.
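As a toy illustration of that regex approach (the HTML fragment here is fabricated; the real patterns below target each site's actual markup):

import re
html = '<tr><td>115.28.10.1</td><td>8888</td></tr>'  # fabricated sample row
pairs = re.findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)</td>', html)
print pairs  # [('115.28.10.1', '8888')]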
Here is the full code first:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'ATP'
import urllib
import urllib2
import re
import time
import threading
import socket
from bs4 import BeautifulSoup
import sys
import random
import getProxy
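# Python 2 hack: make utf-8 the default codec so implicit str/unicode
# conversions of Chinese text don't raise UnicodeDecodeError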
reload(sys)
sys.setdefaultencoding('utf-8')
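# parser backend name to hand to BeautifulSoup(markup, BSparser)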
BSparser = 'html.parser'
UserAgents=['Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36',
'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36']
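# A random entry is meant to be picked per request, e.g.
# headers = {'User-Agent': random.choice(UserAgents)}, so successive
# requests don't all share one browser fingerprint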
# Obtain proxies from proxy360.cn
class proxy_proxy360:
def __init__(self):
self.region=['Brazil','China','America','Taiwan','Japan','Thailand','Vietnam','bahrein']
self.root_url = 'http://www.proxy360.cn/Region/'
self.qqproxy_url='http://www.proxy360.cn/QQ-Proxy'
self.msnproxy_url = 'http://www.proxy360.cn/MSN-Proxy'
def getUrlList(self,proxyType='normal',proxyLocation='China'):
url = []
if proxyType == 'normal' or proxyType == 'all':
if proxyLocation == 'all':
for ri in self.region:
url.append(self.root_url + ri)
else:
url.append(self.root_url + proxyLocation)
if proxyType == 'qq' or proxyType == 'all':
url.append(self.qqproxy_url)
if proxyType == 'msn' or proxyType == 'all':
url.append(self.msnproxy_url)
return url
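    # e.g. proxy_proxy360().getUrlList() -> ['http://www.proxy360.cn/Region/China'];
    # getUrlList('all', 'all') collects every region page plus the
    # QQ-Proxy and MSN-Proxy lists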
    def getProxy(self, url=None):
        url = url or []  # avoid the mutable-default-argument pitfall
        items = []
try:
for urlitem in url:
print 'Get proxy from url: ' + urlitem
#request = urllib2.Request(urlitem,headers=headers)
request = urllib2.Request(urlitem)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
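                # proxy360 lists each field of a proxylistitem row in its own
                # <span>, so the pattern grabs eight consecutive span contents
                # (ip, port, anonymity, location, update time, scores) plus the
                # flag <div>'s title attribute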
pattern = re.compile(r'<div.*?proxylistitem.*?>.*?>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?>\s*(.*?)\s*</span>'
r'.*?/div>.*?>.*?<div.*?title="(.*?)">', re.S)
itemslist = re.findall(pattern,content)
for itemsnew in itemslist:
itemproperty = {}
itemproperty['ip']=itemsnew[0]
itemproperty['port'] = itemsnew[1]
itemproperty['anony_degree'] = itemsnew[2]
itemproperty['location'] = itemsnew[3]
itemproperty['updata_time'] = itemsnew[4]
itemproperty['today_mark'] = itemsnew[5]
itemproperty['total_mark'] = itemsnew[6]
item