采集数据-CSDN博客

本文链接：https://blog.csdn.net/pycdp/article/details/106815256

数据采集

网页采集数据：打开链接（模仿请求头和表单）----->获取分析html-----> 清洗提取信息

正则

符号	含义	符号	含义
.	匹配换行符以外的任何字符	\	转义
^	匹配字符串的开头	$	匹配字符串的末尾
*	匹配前一正则的0+个	+	匹配前一正则的1+个
?	匹配前一正则0或1个	{m}	匹配前一正则m个
{m,n}	匹配前一正则m到n个	[19]、[1-9]	分别匹配1或9、1到9中的任一字符（注意：写成[1,9]还会匹配逗号）

xpath

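html.fromstring(text).xpath(" ")  # text 为网页源码字符串（from lxml import html）
etree.HTML(text).xpath(" ")  # from lxml import etree；注意 etree.tostring() 返回的是 bytes，不能再调用 xpath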
1、//* 列表；//p 元素列表；//text() 文本列表；
2、获取a标签的内容：先用 '//div/ul/li/a' 取得元素 a，再读取 a.text（是属性，不是方法）；或者直接用 '//div/ul/li/a/text()'（末尾不能再加“/”）
3、获取a标签的属性：‘//div/ul/li/a/@href’
4、定位(属性、倒数位置)：'//a[@id="kw"]'；'//li[last()]'；'//li[last()-1]'

1.相关包的导入

import urllib.request
from io import BytesIO
import chardet
from lxml import html
import gzip  # 解析压缩的html

2.打开链接

def open_text(path1):
    """Fetch a URL and return its body decoded as UTF-8 text.

    The request advertises ``Accept-Encoding: gzip``, but a server is free to
    ignore that header and answer with an uncompressed body. The payload is
    therefore only decompressed when it really is a gzip stream; otherwise it
    is decoded as received.

    Args:
        path1: Absolute URL to open.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the (decompressed) body is not valid UTF-8.
    """
    request = urllib.request.Request(path1, headers={"Accept-Encoding": "gzip"})
    # The context manager releases the connection even if read() raises.
    with urllib.request.urlopen(request) as response:
        data = response.read()
    # Every gzip stream starts with the two magic bytes 0x1f 0x8b.
    if data[:2] == b"\x1f\x8b":
        data = gzip.decompress(data)
    return data.decode('utf-8')

3.清洗、采集数据

def job_it(path1):
    """Scrape one job posting and its company page into a one-row DataFrame.

    The posting page at ``path1`` is downloaded first; the link to the
    company page is read from it and that page is downloaded as well. Fields
    are then picked out of the extracted text nodes by fixed position.

    Scraping is best-effort: if anything goes wrong after the posting page
    has been parsed (missing link, failed download, fewer text nodes than
    expected, ...), the problem is logged and an empty DataFrame is returned
    so the caller can skip this posting.

    Args:
        path1: Absolute URL of the job posting page.

    Returns:
        A ``pandas.DataFrame`` with a single row (index 0) on success, or an
        empty DataFrame when the posting could not be scraped.

    Raises:
        Whatever ``open_text`` raises for ``path1`` itself; only failures
        after that first download are handled here.
    """
    import logging

    import pandas as pd

    buff_three = open_text(path1)
    sel_three = html.fromstring(buff_three)
    dct1 = {}
    try:
        # Follow the link on the posting page to the company page.
        company_href = sel_three.xpath(
            '//section[@class="jobdetail"]/p/a[@class="green"]/@href')[0]
        url_four = "https://m.yingjiesheng.com" + company_href.lstrip(".")
        buff_four = open_text(url_four)
        sel_four = html.fromstring(buff_four)

        # NOTE: nodes that are empty after strip() are kept on purpose. The
        # original filter compared the stripped text with " ", which can
        # never match, and the fixed positions used below rely on every text
        # node (blank or not) staying in the list.
        employments1 = [
            x.strip()
            for x in sel_four.xpath('//section[@class="jobdetail"]/p/text()')
        ]

        # Text nodes of the posting page itself (XPath union, so the result
        # comes back in document order).
        employments2 = [
            x.strip()
            for x in sel_three.xpath(
                '//section[@class="jobdetail"]/h1/text() | '
                '//section[@class="jobdetail"]/p/text() | '
                '//section[@class="jobdetail"]/p/a/text()')
        ]

        # Positions below presumably mirror the page layout at the time of
        # writing - TODO confirm against the live site if results look off.
        dct1 = {"工作地点": [employments2[8]],
                "企业规模": [employments1[3]],
                "有效日期": [employments2[6]],
                "职位名称": [employments2[0]],
                "招聘人数": [employments2[12]],
                "所属行业": [employments1[1]],
                "公司名称": [employments2[2]],
                "企业性质": [employments1[5]],
                "职业描述": [" ".join(employments2[14:-1])],
                "职位性质": [employments2[4]]}
    except Exception:
        # Deliberately broad: a single bad posting must not stop the crawl,
        # but the failure is recorded instead of being silently dropped.
        logging.getLogger(__name__).warning(
            "skipping job posting %s", path1, exc_info=True)
    return pd.DataFrame(dct1)

4.写入csv文件（可用Excel打开）


def write_excel(df, file):
    """Append ``df`` to the CSV file ``file``, writing the header row once.

    The column header is written only when ``file`` does not exist yet or is
    still empty; every later call appends data rows only. The previous test
    (``df.index == 0``) depended on the caller's row label, so no header was
    produced whenever numbering did not start at 0, and it raised for frames
    with more than one row.

    Args:
        df: The ``pandas.DataFrame`` to append.
        file: Path of the CSV file (created if missing).
    """
    import os

    needs_header = not os.path.exists(file) or os.path.getsize(file) == 0
    # utf_8_sig puts a BOM at the start of a new file so that Excel detects
    # the encoding of the Chinese column names.
    df.to_csv(file, mode="a", encoding="utf_8_sig", header=needs_header)

5.开始采集


def company_links(path1, num, file, cumulative_entries):
    """Crawl up to ``num`` listing pages of one region and store every posting.

    For each listing page:
      1. download the page and collect the links to the job postings;
      2. scrape every posting with ``job_it``;
      3. label the resulting row with a running entry number and append it
         to ``file`` with ``write_excel``.
    The crawl then moves on to the first link found in the pagination block,
    and stops early when a page has no such link.

    Args:
        path1: Site-relative path of the region's first listing page.
        num: Maximum number of listing pages to read (anything ``int()``
            accepts).
        file: CSV file the rows are appended to.
        cumulative_entries: Number of entries already written before this
            call; numbering continues from here.

    Returns:
        The running total of entries after this region, i.e.
        ``cumulative_entries`` plus the number of rows written by this call.
    """
    base_url = "https://m.yingjiesheng.com"
    path_region = path1
    new_index = cumulative_entries
    for _ in range(int(num)):
        sel_two = html.fromstring(open_text(base_url + path_region))
        xyy = sel_two.xpath('//div[@class="page"]/a/@href')  # pagination links
        link_company = sel_two.xpath(
            '//ul[@class="list link_visit"]/li/a/@href')  # posting links

        for v in link_company:
            df = job_it(base_url + v)
            if len(df) == 0:
                # job_it returns an empty frame for postings it could not parse.
                continue
            new_index += 1
            # Replace the default row label 0 with the running entry number.
            df.rename({0: new_index}, inplace=True)
            write_excel(df, file)

        if not xyy:
            # No pagination link on this page: nothing further to follow.
            break
        # The pagination href is joined onto the region's first path segment.
        path_region = "/" + path_region.split("/")[1] + "/" + xyy[0]
    return new_index

# Open the site's home page.
url_index = "https://m.yingjiesheng.com/?SslSign=Y"
buff_index = open_text(url_index)
# Collect the links to the per-region listing pages from the home page.
sel_index = html.fromstring(buff_index)
link_region = sel_index.xpath('//ul[@class="nav_hotcity"]/li/a/@href')
cumulative_entries = 0  # running total of entries written so far
# NOTE(review): the last link is skipped - presumably it is not a region
# page; confirm against the site.
for i in link_region[0:-1]:
    # Crawl two listing pages of this region and store the postings.
    en = company_links(i, 2, 'aa.csv', cumulative_entries)
    # company_links() returns the running total (it starts counting from the
    # value passed in), so assign it; adding it would count rows twice.
    cumulative_entries = en