Web Scraping Practice 1: Scraping Company Data from Shixiseng (实习僧)

This post documents my first attempt at web scraping: collecting data on companies offering Python internship positions from the Shixiseng (实习僧) website and saving it to an Excel spreadsheet. It uses the requests, re, BeautifulSoup and xlsxwriter libraries and walks through fetching each company page's URL, parsing the data, handling the site's custom font, and storing the results. The dynamic-font problem is not solved yet, but basic scraping and storage already work.


About this exercise:

I needed data for a data-analysis competition, so this is my first attempt at scraping web data. The target is the Python internship listings on the Shixiseng site; the scraped data is then saved to an Excel file.

Libraries used

requests, re, BeautifulSoup, xlsxwriter
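Of these, only re ships with the Python standard library; the others can be installed with something like pip install requests beautifulsoup4 lxml xlsxwriter (lxml is included here only because BeautifulSoup is created with the 'lxml' parser in the code below; exact versions are not pinned).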

Process overview

First get the URL of each company's detail page, then scrape the data from each page. The scraping step involves decoding the site's custom web font. Finally, the scraped data is stored in an Excel spreadsheet.

Detailed steps

Getting the URL of each company page
def geturl(url):  # collect the URL of each Python internship detail page on Shixiseng
    try:
        html = requests.get(url, headers=headers, timeout=30)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        # grab the href of every company detail page on the Python search-results page
        response = re.findall('href="/intern/inn.*?"', html.text)
        for i in range(len(response)):  # clean up the matched suffixes (my regex is weak, so the first pass picks up junk)
            result = re.findall(r'/intern/inn_\w*', response[i])
            ht_ur = 'https://www.shixiseng.com' + str(result[0])
            urls.append(ht_ur)
    except Exception:
        return ""
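Since the comment above admits the regex pass picks up junk, here is a hedged alternative sketch that lets BeautifulSoup collect the detail-page links directly. It reuses the same headers and urls globals as the function above; the a[href^="/intern/inn"] selector is my assumption about the result page's markup and would need checking against the live page.
# Alternative sketch (not the original approach): extract the links with BeautifulSoup
# instead of a hand-written regex; assumes detail-page hrefs start with /intern/inn.
def geturl_bs(url):
    try:
        html = requests.get(url, headers=headers, timeout=30)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        soup = BeautifulSoup(html.text, 'lxml')
        for a in soup.select('a[href^="/intern/inn"]'):
            urls.append('https://www.shixiseng.com' + a['href'])
    except Exception:
        return ""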
Scraping each company's details
def gethtml(url):  # scrape one internship listing's details
    fonts = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    # each entry starts out as a CSS class name and is overwritten with the scraped text
    work = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
            'new_job_name', 'job_position', 'job_academic']
    for i in range(0, 9):
        try:
            work[i] = soup.find(class_=work[i]).string.strip()
            if i == 8:  # after the fields above, also grab the requirements the company lists for interns
                response = soup.find(class_='job_detail')
                text = response.get_text().strip()
                work.append(text)
        except Exception:
            work[i] = []
    for i in range(0, 3):  # the numeric fields use a custom font, so decode them character by character
        for ch in work[i]:
            font = font_find(ch)
            fonts.append(font)
        work[i] = ''.join(fonts)
        fonts = []
    return work
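Overwriting the class-name list in place works, but it is easy to lose track of which value is which. Purely as an illustration of that design choice (not what this post's code does), the same fields could be collected into a dict keyed by field name; the CSS class names, headers global and font_find are the same ones used elsewhere in this post.
# Illustrative sketch only: same selectors as gethtml, but each value keeps its field name.
def gethtml_dict(url):
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'lxml')
    classes = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic']
    record = {}
    for cls in classes:
        tag = soup.find(class_=cls)
        record[cls] = tag.string.strip() if tag and tag.string else ''
    detail = soup.find(class_='job_detail')
    record['job_introduce'] = detail.get_text().strip() if detail else ''
    for cls in classes[:3]:  # the numeric fields still need the custom-font decoding shown below
        record[cls] = ''.join(font_find(ch) for ch in record[cls])
    return record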
Decoding the custom font
basefonts = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# the codepoints below are refreshed by the site every day; since I have only just started with
# scraping, I don't yet know how to decode the dynamic custom font, so this is a static mapping
baselist = ['\uf829', '\ueabc', '\uf381', '\ue9ab', '\ue91c', '\uf004', '\ue736', '\ue5db', '\uf519', '\uf11b']
dictionary = dict(zip(baselist, basefonts))  # codepoint -> real digit
# look a single character up in the mapping
def font_find(ch):
    for font in baselist:
        if ch == font:
            ch = dictionary[font]
            break
    return ch
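For reference, here is a rough, untested sketch of how the daily-refreshed mapping might be rebuilt automatically instead of being hard-coded: download the @font-face file the page references and read its character map with the fontTools library. The woff URL and the assumption that each glyph name encodes the real character (the common 'uniXXXX' naming) are both hypothetical and would need verifying against the site.
# Hedged sketch, not tested against the live site: rebuild the mapping from the page's woff.
import io
import requests
from fontTools.ttLib import TTFont  # pip install fonttools

def build_font_map(woff_url, headers=None):
    woff_bytes = requests.get(woff_url, headers=headers).content
    font = TTFont(io.BytesIO(woff_bytes))   # load the font file from memory
    cmap = font.getBestCmap()               # {codepoint: glyph name}
    mapping = {}
    for codepoint, glyph_name in cmap.items():
        if glyph_name.startswith('uni'):    # e.g. 'uni30' -> '0' (assumed naming scheme)
            mapping[chr(codepoint)] = chr(int(glyph_name[3:], 16))
    return mapping

# usage (the woff URL has to be taken from the page's CSS and changes over time):
# dictionary = build_font_map(woff_url_from_page_css, headers=headers)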
Writing the scraped data to an Excel spreadsheet
# I first used xlwt, but it raised an error about exceeding a 256 limit, so I switched to xlsxwriter
def wr_excle(data):
    row = 0
    line = 0
    dataset = data
    file = xlsxwriter.Workbook("new_excle.xlsx")
    worksheet = file.add_worksheet('sheet1')
    tb_head = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic', 'job_introduce']
    for con in tb_head:  # write the header row into row 0
        worksheet.write(0, line, con)
        line = line+1
    line = 0
    for company in dataset:  # write each scraped record into its own row
        for content in company:
            try:
                worksheet.write(row+1, line, content)
                line = line+1
            except Exception:
                worksheet.write(row+1, line, 'None')
                line = line+1
        row = row+1
        line = 0
    file.close()
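As a side note, if pandas is available, the export step can be written more compactly; the sketch below is an alternative under that assumption, not what the code above does. gethtml sometimes returns 9 fields instead of 10 (when job_detail is missing), so short rows are padded before building the DataFrame.
# Alternative sketch using pandas (requires pandas plus an Excel engine such as xlsxwriter).
import pandas as pd

def wr_excel_pandas(datas, path="new_excle.xlsx"):
    columns = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic', 'job_introduce']
    rows = [list(r) + [''] * (len(columns) - len(r)) for r in datas]  # pad short rows
    pd.DataFrame(rows, columns=columns).to_excel(path, index=False)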

Complete code

import requests
import re
from bs4 import BeautifulSoup
import xlsxwriter

urls = []
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"}
basefonts = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
baselist = ['\uf829', '\ueabc', '\uf381', '\ue9ab', '\ue91c', '\uf004', '\ue736', '\ue5db', '\uf519', '\uf11b']
dictionary = dict(zip(baselist, basefonts))


def geturl(url):  # collect the URL of each Python internship detail page on Shixiseng
    try:
        html = requests.get(url, headers=headers, timeout=30)
        html.raise_for_status()
        html.encoding = html.apparent_encoding
        response = re.findall('href="/intern/inn.*?"', html.text)
        for i in range(len(response)):
            result = re.findall(r'/intern/inn_\w*', response[i])
            ht_ur = 'https://www.shixiseng.com' + str(result[0])
            urls.append(ht_ur)
    except Exception:
        return ""


def font_find(ch):  # map one custom-font character back to its real value
    for font in baselist:
        if ch == font:
            ch = dictionary[font]
            break
    return ch


def gethtml(url):  # scrape one internship listing's details
    fonts = []
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    work = ['job_money',  'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
            'new_job_name', 'job_position',  'job_academic']
    for i in range(0, 9):
        try:
            work[i] = soup.find(class_=work[i]).string.strip()
            if i == 8:
                response = soup.find(class_='job_detail')
                text = response.get_text().strip()
                work.append(text)
        except Exception:
            work[i] = []
    for i in range(0, 3):
        for ch in work[i]:
            font = font_find(ch)
            fonts.append(font)
        work[i] = ''.join(fonts)
        fonts = []
    return work


def wr_excle(data):
    row = 0
    line = 0
    dataset = data
    file = xlsxwriter.Workbook("new_excle.xlsx")
    worksheet = file.add_worksheet('sheet1')
    tb_head = ['job_money', 'job_week', 'job_time', 'com-name', 'com-num', 'com-class',
               'new_job_name', 'job_position', 'job_academic', 'job_introduce']
    for con in tb_head:
        worksheet.write(0, line, con)
        line = line+1
    line = 0
    for company in dataset:
        for content in company:
            try:
                worksheet.write(row+1, line, content)
                line = line+1
            except Exception:
                worksheet.write(row+1, line, 'None')
                line = line+1
        row = row+1
        line = 0
    file.close()


def main():
    datas = []
    for i in range(1, 2):  # only the first page of search results for now
        url = 'https://www.shixiseng.com/interns/c-None_?k=python&p='+str(i)
        geturl(url)
    for url in urls:
        data = gethtml(url)
        datas.append(data)
    wr_excle(datas)


if __name__ == '__main__':
    main()

[^1]: I have only just started with web scraping, so there is a lot I do not yet understand; apologies if parts of the code are wrong.
[^2]: Parts of the implementation are based on explanations found online and on 《Python3网络爬虫开发实践》.
[^3]: The dynamic-font part is still unsolved, so this code is kept only as a record of the current stage.
