#!/usr/bin/env python
# coding=utf-8
"""Scrape the glossary pages of zhaojianzhu.com into an Excel (.xls) sheet.

Each listing page is downloaded, every glossary entry on it is reduced to a
``[name, description, source, date]`` row, and all rows are finally written
to a single worksheet.

The script runs unchanged on Python 2.7 and Python 3.
"""
try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen into urllib.request with the same API.
    import urllib.request as urllib2

from bs4 import BeautifulSoup
import xlwt

# Pretend to be a desktop browser when requesting pages.
headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

# Rows collected by parseHtml(); each row is [name, description, source, date].
result = []

# Header row: name, English name, explanation, source, date.
_HEADER_TITLES = (u'名称', u'英文名称', u'解释', u'来源', u'日期')


def getHtml(page, timeout=30):
    """Download one glossary listing page and return its raw body.

    Args:
        page: Page number; it is converted with ``str()`` and appended to
            the listing URL, so both ``"3"`` and ``3`` are accepted.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the crawl forever.

    Returns:
        The undecoded response body as returned by ``response.read()``.
    """
    url = 'http://www.zhaojianzhu.com/shuyu?page=' + str(page)
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request, timeout=timeout)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()


def _tag_text(tag):
    """Return the text of *tag*, or an empty string when the tag is missing."""
    if tag is None:
        return u''
    return tag.text


def parseHtml(htmlContent):
    """Extract the glossary entries of one page and append them to ``result``.

    Every ``<th>`` element whose class attribute is ``pzn ptmn pbmn`` is
    treated as one entry. A field whose element is absent is stored as an
    empty string.

    Args:
        htmlContent: HTML of one listing page, as returned by getHtml().
    """
    soup = BeautifulSoup(htmlContent, "html.parser")
    entries = soup.find_all('th', class_='pzn ptmn pbmn')
    print('处理中...')
    for entry in entries:
        name = entry.find('a', class_='list_title')
        desc = entry.find('p', class_='mtm xi6 xs2')
        source = entry.find('a', class_='xg2')
        # The date is the last 'xg1' span; an entry without any such span
        # must yield an empty date instead of raising IndexError.
        spans = entry.find_all('span', class_='xg1')
        date = spans[-1] if spans else None
        result.append([_tag_text(tag) for tag in (name, desc, source, date)])


def _to_text(value):
    """Return *value* as unicode text, decoding UTF-8 byte strings."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def writeexcel(data, path=r'e:\shuyu.xls'):
    """Write the collected rows to an .xls workbook.

    Args:
        data: Sequence of rows ``[name, description, source, date]``. Values
            may be unicode text or UTF-8 encoded byte strings.
        path: Destination file; defaults to the location the script has
            always written to.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet('test', cell_overwrite_ok=True)

    # Header row.
    for column, title in enumerate(_HEADER_TITLES):
        sheet.write(0, column, title)

    # Data rows start right below the header.
    for row, record in enumerate(data, 1):
        name, desc, source, date = [_to_text(value) for value in record[:4]]
        # The name field holds "<Chinese name> <English name>": everything
        # before the first space goes to column 0, the remainder to column 1.
        chinese, _, english = name.partition(u' ')
        cells = (chinese, english, desc, source, date)
        for column, value in enumerate(cells):
            # Blank values are skipped so the cell stays empty.
            if value.strip():
                sheet.write(row, column, value)

    book.save(path)


def main():
    """Crawl pages 1..1000 and save everything that was collected."""
    for i in range(1, 1001):
        print("第{}页".format(i))
        try:
            parseHtml(getHtml(i))
        except IOError as exc:
            # Network failures (URL errors, timeouts) surface as IOError /
            # OSError; skip the page rather than losing all earlier rows.
            print("page {} failed: {!r}".format(i, exc))
    writeexcel(result)


if __name__ == '__main__':
    main()
# Python爬虫-爬取找建筑网站术语
# 最新推荐文章于 2025-02-17 12:11:42 发布