Scraping Exercise 1: Scraping Company Data from Shixiseng (实习僧)
Exercise overview:
A data-analysis competition needed this data, so this was my first attempt at scraping a web page. The target is the Python internship listings on Shixiseng, and the scraped data is saved to an Excel file.
Libraries used
requests, re, BeautifulSoup (bs4, with the lxml parser), xlsxwriter
Process overview
Collect the URL of each company's detail page, then scrape the data from those pages. The scraping involves decoding the site's custom web font. Finally, the scraped data is stored in an Excel sheet.
Step-by-step
Getting each company page's URL
def geturl(url):  # collect the URL of each Python listing's detail page from a Shixiseng search-result page
    try:
        html = requests.get(url, headers=headers, timeout=30)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        response = re.findall(r'href="/intern/inn.*?"', html.text)  # pull every detail-page suffix out of the result page
        for i in range(len(response)):
            result = re.findall(r'/intern/inn_\w*', response[i])  # clean the suffix (my regex is weak, so the first pass picked up junk)
            ht_ur = 'https://www.shixiseng.com' + str(result[0])
            urls.append(ht_ur)
    except Exception:
        return ""
Scraping each company's details
def gethtml(url):  # scrape one company's internship details
    fonts = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    work = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
            'new_job_name', 'job_position', 'job_academic']
    for i in range(0, 9):
        try:
            work[i] = soup.find(class_=work[i]).string.strip()
            if i == 8:  # after the fields above, also grab the company's full requirements for the intern
                response = soup.find(class_='job_detail')
                text = response.get_text().strip()
                work.append(text)
        except Exception:
            work[i] = ''  # field missing on this page; keep the cell writable
    for i in range(0, 3):  # the numeric fields use the custom font, so decode them here
        for ch in work[i]:
            font = font_find(ch)
            fonts.append(font)
        work[i] = ''.join(fonts)
        fonts = []
    return work
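For reference, a call looks like this; the inn_ suffix below is a placeholder, not a real listing:

# Hypothetical usage; the inn_ suffix is a placeholder.
info = gethtml('https://www.shixiseng.com/intern/inn_xxxxxxxxxxxx')
# info holds, in order: salary, days per week, duration, company name,
# company size, industry, job title, city, degree requirement, and the
# full job description appended at the end.
print(info)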
Decoding the custom font
basefonts = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# The codepoints below refresh daily. I'm new to scraping and don't yet know how
# to decode the dynamic custom font, so this is only a static mapping.
baselist = ['\uf829', '\ueabc', '\uf381', '\ue9ab', '\ue91c', '\uf004', '\ue736', '\ue5db', '\uf519', '\uf11b']
dictionary = dict(zip(baselist, basefonts))
# font decoding
def font_find(ch):  # map one obfuscated codepoint back to its digit
    for font in baselist:
        if ch == font:
            ch = dictionary[font]
            break
    return ch
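Because the codepoints refresh daily, the static table above goes stale quickly. The usual fix (which I have not implemented; see footnote [^3]) is to download the page's @font-face file on every run and rebuild the mapping with fontTools. The sketch below rests on two assumptions about the site that I have not verified: that the woff URL can be regex-matched out of the HTML, and that glyph names follow the 'uniXXXX' pattern encoding the real character:

import io
import re
import requests
from fontTools.ttLib import TTFont

def build_font_map(page_html):
    # Assumption: the page embeds an @font-face rule whose woff URL this
    # regex can capture; the exact pattern would need checking in devtools.
    match = re.search(r'url\((//[^)]+?\.woff)\)', page_html)
    if not match:
        return {}
    # headers is the same dict defined earlier in this post.
    woff = requests.get('https:' + match.group(1), headers=headers).content
    font = TTFont(io.BytesIO(woff))
    mapping = {}
    # getBestCmap() maps each obfuscated codepoint to a glyph name.
    # Assumption: glyph names look like 'uni0030', i.e. they spell out the
    # hex codepoint of the real character.
    for codepoint, glyph_name in font.getBestCmap().items():
        if glyph_name.startswith('uni'):
            mapping[chr(codepoint)] = chr(int(glyph_name[3:], 16))
    return mapping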
Saving the scraped data to Excel
# I started with xlwt, but it raised a limit error (the legacy .xls format it
# writes caps out at 256 columns and 65,536 rows), so I switched to xlsxwriter.
def wr_excel(data):
    row = 0
    line = 0  # 'line' tracks the column index
    dataset = data
    file = xlsxwriter.Workbook("new_excel.xlsx")
    worksheet = file.add_worksheet('sheet1')
    tb_head = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic', 'job_introduce']
    for con in tb_head:  # write the header row
        worksheet.write(0, line, con)
        line = line + 1
    line = 0
    for company in dataset:  # write each scraped record into the sheet
        for content in company:
            try:
                worksheet.write(row + 1, line, content)
            except Exception:
                worksheet.write(row + 1, line, 'None')
            line = line + 1  # advance the column even when a write fails, so cells stay aligned
        row = row + 1
        line = 0
    file.close()
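A quick smoke test of the layout; the row values below are invented placeholders:

# Placeholder data just to check the sheet layout; the values are made up.
rows = [['150-200/day', '4 days/week', '3 months', 'ExampleCo', '150-500 people',
         'Internet', 'Python Intern', 'Beijing', 'Bachelor', 'description text']]
wr_excel(rows)  # writes new_excel.xlsx: the header row plus one record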
Complete code
import requests
import re
from bs4 import BeautifulSoup
import xlsxwriter
urls = []
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}
basefonts = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
baselist = ['\uf829', '\ueabc', '\uf381', '\ue9ab', '\ue91c', '\uf004', '\ue736', '\ue5db', '\uf519', '\uf11b']  # static snapshot; the site refreshes these daily
dictionary = dict(zip(baselist, basefonts))
def geturl(url):  # collect the URL of each Python listing's detail page
    try:
        html = requests.get(url, headers=headers, timeout=30)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        response = re.findall(r'href="/intern/inn.*?"', html.text)
        for i in range(len(response)):
            result = re.findall(r'/intern/inn_\w*', response[i])
            ht_ur = 'https://www.shixiseng.com' + str(result[0])
            urls.append(ht_ur)
    except Exception:
        return ""
def font_find(ch):  # font de-obfuscation: map one codepoint back to its digit
    for font in baselist:
        if ch == font:
            ch = dictionary[font]
            break
    return ch
def gethtml(url):  # scrape one company's internship details
    fonts = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    work = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
            'new_job_name', 'job_position', 'job_academic']
    for i in range(0, 9):
        try:
            work[i] = soup.find(class_=work[i]).string.strip()
            if i == 8:
                response = soup.find(class_='job_detail')
                text = response.get_text().strip()
                work.append(text)
        except Exception:
            work[i] = ''
    for i in range(0, 3):
        for ch in work[i]:
            font = font_find(ch)
            fonts.append(font)
        work[i] = ''.join(fonts)
        fonts = []
    return work
def wr_excel(data):
    row = 0
    line = 0
    dataset = data
    file = xlsxwriter.Workbook("new_excel.xlsx")
    worksheet = file.add_worksheet('sheet1')
    tb_head = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic', 'job_introduce']
    for con in tb_head:
        worksheet.write(0, line, con)
        line = line + 1
    line = 0
    for company in dataset:
        for content in company:
            try:
                worksheet.write(row + 1, line, content)
            except Exception:
                worksheet.write(row + 1, line, 'None')
            line = line + 1
        row = row + 1
        line = 0
    file.close()
def main():
    datas = []
    for i in range(1, 2):  # only page 1 here; widen the range to scrape more pages
        url = 'https://www.shixiseng.com/interns/c-None_?k=python&p=' + str(i)
        geturl(url)
    for url in urls:
        data = gethtml(url)
        datas.append(data)
    wr_excel(datas)

if __name__ == '__main__':
    main()
[^1]: I'm brand new to scraping, so there is plenty I don't understand yet; apologies for anything wrong in the code.
[^2]: Parts of the implementation follow online write-ups and the book 《Python3网络爬虫开发实践》.
[^3]: The dynamic font problem is still unsolved, so that part of the code is only a snapshot of the current state (the fontTools sketch above outlines one possible direction).