# -*- coding: utf-8 -*-
import urllib2
import re
import BeautifulSoup
import json
def getPage(url):  # fetch the raw HTML of a page
    flag = True
    while flag:  # keep retrying so a dropped connection does not kill the crawl
        try:
            print url
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            page = response.read()
            flag = False
            return page
        except Exception, e:
            if hasattr(e, "reason"):
                print u"Connection failed, reason:", e.reason
            flag = True
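# The loop above retries forever on any exception; a variant with a bounded
# number of attempts and a short pause between them is gentler on the server.
# A minimal sketch (max_tries and the 2-second delay are arbitrary choices):
import time
def getPageBounded(url, max_tries=5):
    for _ in range(max_tries):
        try:
            return urllib2.urlopen(urllib2.Request(url)).read()
        except Exception, e:
            if hasattr(e, "reason"):
                print u"Connection failed, reason:", e.reason
            time.sleep(2)  # back off briefly before the next attempt
    return None  # caller must handle the permanent failure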
def getMaxPageNum(page):  # get the page count of a hospital/department/doctor listing
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('span', attrs={'class': 'contcss'})
    # take the first number before the '/' in the pagination span
    return int(re.findall(r"\d+", res[0].text.split("/")[0])[0].encode("utf-8"))
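# A defensive variant of the lookup above: grab the first number anywhere in
# the span text and fall back to one page when the span is missing. A minimal
# sketch; the exact text format of the 'contcss' span is an assumption
# inferred from the split('/') above.
def getMaxPageNumSafe(page):
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('span', attrs={'class': 'contcss'})
    if not res:
        return 1  # no pagination span found: treat the listing as one page
    nums = re.findall(r"\d+", res[0].text)
    return int(nums[0]) if nums else 1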
def dumpDocInfo(url):  # write one doctor's data to a local file
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:  # skip pages without the expected 'map' block
        return
    xingming = soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text  # name
    zhicheng = soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text  # professional title
    yiyuan = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[0][3:]  # hospital
    keshi = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[1]  # department
    jianjie = soup.findAll(attrs={'class': 'regdoc_msg'})[0].text[3:]  # biography
    shanchang = soup.findAll(attrs={'class': 'regdoc_msg'})[1].text[3:]  # specialties
    info = {u'姓名': xingming, u'职称': zhicheng, u'医院': yiyuan, u'科室': keshi, u'简介': jianjie, u'擅长': shanchang}
    info = json.dumps(info, ensure_ascii=False)
    outfile = open("C:\\Users\\Administrator\\Desktop\\doctors.txt", "ab")
    outfile.write(info.encode('utf-8') + '\n')
    outfile.close()  # close so the data is flushed even if the crawl is interrupted
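# The file now holds one UTF-8 encoded JSON object per line; a minimal sketch
# of reading it back (the default path mirrors the one hard-coded above):
def loadDocInfo(path="C:\\Users\\Administrator\\Desktop\\doctors.txt"):
    records = []
    infile = open(path, "rb")
    for line in infile:
        line = line.strip()
        if line:
            records.append(json.loads(line.decode("utf-8")))
    infile.close()
    return records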
def dumpHisInfo(url):  # write one hospital's data to a local file (same extraction as dumpDocInfo, different output file)
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:  # skip pages without the expected 'map' block
        return
    xingming = soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text  # name
    zhicheng = soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text  # professional title
    yiyuan = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[0][3:]  # hospital
    keshi = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text.split(u'科室:')[1]  # department
    jianjie = soup.findAll(attrs={'class': 'regdoc_msg'})[0].text[3:]  # biography
    shanchang = soup.findAll(attrs={'class': 'regdoc_msg'})[1].text[3:]  # specialties
    info = {u'姓名': xingming, u'职称': zhicheng, u'医院': yiyuan, u'科室': keshi, u'简介': jianjie, u'擅长': shanchang}
    info = json.dumps(info, ensure_ascii=False)
    outfile = open("C:\\Users\\Administrator\\Desktop\\hospital.txt", "ab")
    outfile.write(info.encode('utf-8') + '\n')
    outfile.close()  # close so the data is flushed even if the crawl is interrupted
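# dumpDocInfo and dumpHisInfo share all of their scraping logic and differ
# only in the output file; a merged sketch (not wired into the main loop,
# behaviour otherwise unchanged):
def dumpInfo(url, path):
    page = getPage(url)
    soup = BeautifulSoup.BeautifulSoup(page)
    if len(soup.findAll(attrs={'class': 'map'})) == 0:
        return
    common = soup.findAll(attrs={'class': 'regdoc_commom'})[1].text
    info = {
        u'姓名': soup.findAll(attrs={'class': 'map'})[0].findAll('a')[2].text,
        u'职称': soup.findAll(attrs={'class': 'regdoc_name'})[0].findAll('span')[0].text,
        u'医院': common.split(u'科室:')[0][3:],
        u'科室': common.split(u'科室:')[1],
        u'简介': soup.findAll(attrs={'class': 'regdoc_msg'})[0].text[3:],
        u'擅长': soup.findAll(attrs={'class': 'regdoc_msg'})[1].text[3:],
    }
    outfile = open(path, "ab")
    outfile.write(json.dumps(info, ensure_ascii=False).encode('utf-8') + '\n')
    outfile.close()
# with the helper, each original call becomes e.g.
#   dumpInfo(url, "C:\\Users\\Administrator\\Desktop\\doctors.txt")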
def findDocHref(page):  # visit the detail page of every doctor listed on this page
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('div', attrs={'class': 'yy_doctor_head'})
    for v in res:
        # the onclick attribute embeds the target URL; cut off the JS prefix and the trailing quote
        dumpDocInfo('http://www.eztcn.com' + v['onclick'].encode("utf-8")[15:-1])
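# The slice [15:-1] above assumes every onclick value looks exactly like
# "location.href='/Home/...'" (fifteen characters of prefix plus a closing
# quote); that prefix is an inference from the slice, not confirmed markup.
# Pulling the first single-quoted string out with a regex is less brittle:
def extractOnclickHref(onclick):
    m = re.search(r"'([^']+)'", onclick)
    return m.group(1) if m else None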
def findHisHref(page):  # collect the links to every hospital listed on this page
    soup = BeautifulSoup.BeautifulSoup(page)
    res = soup.findAll('a')
    arr = []
    for v in res[9::3]:  # hospital links start at the 10th anchor and recur every third one
        if '#' in str(v):  # a '#' link marks the end of the listing on this page
            break
        arr.append("http://www.eztcn.com" + v['href'].encode("utf-8"))
    return arr
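# The slice res[9::3] above depends on the exact number and order of <a> tags
# on the listing page; a sketch that filters on attributes instead (the
# filter is itself an assumption about the real markup and would need the
# hospital-detail URL prefix to be fully precise):
def findHisHrefSafe(page):
    soup = BeautifulSoup.BeautifulSoup(page)
    arr = []
    for a in soup.findAll('a', href=True):
        href = a['href'].encode('utf-8')
        if href.startswith('#'):
            continue  # skip in-page anchors such as pagination markers
        arr.append('http://www.eztcn.com' + href)
    return arr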
# main program starts here
hisMaxPageNum = getMaxPageNum(getPage('http://www.eztcn.com/Home/Find/findHos/p/1.html#selAnchor'))
print "hisMaxPageNum:" + str(hisMaxPageNum)
# the hospital listing had 15 pages at the time of writing
for i in range(1, hisMaxPageNum + 1):
    url = 'http://www.eztcn.com/Home/Find/findHos/p/' + str(i) + '.html#selAnchor'
    page = getPage(url)
    hisHref = findHisHref(page)
    for hisUrl in hisHref:
        page = getPage(hisUrl)
        docMaxPageNum = getMaxPageNum(page)
        dumpHisInfo(hisUrl)
        print "docMaxPageNum:" + str(docMaxPageNum)
        for j in range(1, docMaxPageNum + 1):  # separate index so the outer page loop's i is not shadowed
            url2 = hisUrl[:-5] + '/cd/2016-04-29/p/' + str(j) + '.html#headAnchor'  # [:-5] strips the '.html' suffix
            page2 = getPage(url2)
            findDocHref(page2)
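# Portability note: urllib2 and BeautifulSoup 3.x exist only on Python 2, so
# this script will not run unchanged on Python 3. The equivalent fetch there
# would use the third-party requests and beautifulsoup4 packages, roughly:
#
#   import requests
#   from bs4 import BeautifulSoup
#   page = requests.get(url).text
#   soup = BeautifulSoup(page, 'html.parser')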
A small example of a Python web crawler.