# coding: utf-8
from openpyxl.reader.excel import load_workbook
from openpyxl.workbook import Workbook
from openpyxl.utils import get_column_letter
import re
from bs4 import BeautifulSoup
import requests
file = load_workbook('list.xlsx')  # load the Excel file holding the drug-name list
sheetnames = file.sheetnames
sheet = file[sheetnames[0]]
nrows = sheet.max_row  # number of rows
# print(nrows)
rowList = []
for i in range(1, nrows + 1):
    rowValues = sheet.cell(i, 1).value
    # print(rowValues)
    cut = re.sub(r'\(.*?\)', '', rowValues)  # drop any parenthesised part of the name
    cut = cut.replace(' ', '-')
    cut = cut.replace('+', '-')
    cut = cut.rstrip('-')  # strip leftover trailing hyphens
    # print(cut)
    if cut:
        rowList.append(cut)
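# For illustration (hypothetical entry): "Co-amoxiclav (Augmentin)" becomes
# "Co-amoxiclav": the bracketed part is removed, spaces and '+' turn into '-',
# and any trailing '-' is stripped, leaving a slug ready for the URL below.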
# --------------------------------------------------- #
indexList = ['省略待抓取列表']    # headings to scrape (list omitted here)
specialList = ['省略待抓取列表']  # special-case headings (list omitted here)
classList = indexList + specialList
# ----------------------------------------------------------------------------------------------------------- create the output Excel
workbook = Workbook()
sheet = workbook.active
# ------------------------------------------------------------------------------------------------------------- build the page URLs
starRow = 3
urlList = []
k = '网址'  # base URL of the target site (omitted)
for row in rowList:
    url = k + row
    urlList.append(url)  # assemble the full URL and keep it for the scraping loop
    sheet['A%d' % starRow].value = row  # write the name into the sheet via openpyxl
    sheet['A%d' % (starRow + 1)].value = url
    starRow += 2
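# Column-A layout sketch: row 3 = first drug name, row 4 = its URL, row 5 =
# next name, and so on; two rows per drug, which is why drugNN later also
# advances in steps of 2.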
def regular(mal_1, mal_2):  # extract and clean the text of the two matched divs
    nor_1 = ''
    nor_2 = ''
    if mal_1 and mal_2:
        nor_1 = mal_1.get_text()
        nor_2 = mal_2.get_text()
        nor_2 = nor_2.replace(' ', '')
        nor_2 = nor_2.replace('\xa0', '')  # non-breaking spaces (assumed: the two replaces looked identical in the source)
        nor_2 = nor_2.replace('\n\n', '')
        nor_2 = nor_2.replace('\t', '')  # strip the stray whitespace the page markup leaves behind
    return nor_1, nor_2
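# Usage sketch (the div classes come from the selectors in com() below):
#   nor_1, nor_2 = regular(i.find('div', attrs={'class': 'twocolsub'}),
#                          i.find('div', attrs={'class': 'sixcolsub last'}))
# nor_1 is the row label, nor_2 the compacted value text; both come back
# empty if either div is missing.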
# --------------------------------------------------- #
def excelRegular(nor_1, nor_2, link, drugnum, ccc):  # file the value under its heading column
    if nor_1 == '省略':  # heading placeholder (the real labels are omitted)
        cco = 6 * (ccc - 1) + 2  # every category owns a 6-column block starting at column B
        letter = get_column_letter(cco)
        sheet['%s%d' % (letter, drugnum)].value = nor_2
        sheet['%s%d' % (letter, drugnum + 1)].value = link  # the link goes on the URL row below
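# Column math, for example: ccc=1 writes to column 2 (B), ccc=2 to column 8 (H),
# ccc=3 to column 14 (N); the value lands on the drug's name row and its
# source link on the row underneath.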
# ---------------------------------------------------------------------------------------------------- pull the div contents out of a section
def com(information, drugnn, cco):
    kk = information.find_all("div")
    for i in kk:
        link = ''
        normal_1 = i.find('div', attrs={'class': 'twocolsub'})
        normal_2 = i.find('div', attrs={'class': 'sixcolsub last'})
        links = i.find('a')
        (nor_1, nor_2) = regular(normal_1, normal_2)
        if links:
            link = links.get('href')
        if nor_1 and nor_2:
            print(nor_1, ":", nor_2)
            excelRegular(nor_1, nor_2, link, drugnn, cco)  # write the pair into the sheet
addd = []
t3 = 1
drugNN = 3
for drugName in urlList:  # loop over every assembled URL
    Lactation = []
    H2_list = []
    endH2 = ''
    num = 0
    endNum = 0
    num_1 = 0
    t4 = 0
    page = requests.get(drugName)
    contents = page.text
    soup = BeautifulSoup(contents, 'lxml')  # lxml parses better than the built-in html.parser
    lactation = soup.select('div.eightcol section')
    h2_list = soup.find_all('h2')
    # ------------------------------------------------------------------------------------- find the h2 that follows the target, if any
    for i in h2_list:
        H2_list.append(re.sub(r'\<.*?\>|^\n|\n+(?=\n)|\n$', '', i.text))
    for i in H2_list:
        if i == 'Lactation Safety Information':
            num_1 = H2_list.index('Lactation Safety Information')
            t4 = num_1 + 1  # --------- t4 is the index of the next h2
            if t4 < len(H2_list):
                endH2 = H2_list[t4]
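    # Example (hypothetical page): if H2_list were ['Summary', 'Lactation
    # Safety Information', 'References'], then num_1 = 1, t4 = 2 and
    # endH2 = 'References', the heading that closes off the target block.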
    # ----------------------------------------------------------------------------------------------- check whether the page has the target info
    if soup.select('div.eightcol header h1')[0].text != 'Not Found':
        print(t3, " --------------------", soup.select('div.eightcol header h1')[0].text, "-----------------------",
              '\n', drugName)
        # -------------------------------------------------------------------------------------- work out the target section index num
        for ii in lactation:
            Lactation.append(re.sub(r'\<.*?\>|^\n|\n+(?=\n)|\n$', '', ii.text))
        # print("h2 _ lactation: ", Lactation)
        for i in Lactation:
            if i == 'Lactation Safety Information':
                print("yes")
                num = Lactation.index('Lactation Safety Information')
                num += 1  # the first data section sits right after the heading
            if endH2:
                if i == endH2:
                    endNum = Lactation.index(endH2)
            else:
                endNum = len(Lactation)  # no later h2, so read to the very end
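        # e.g. (hypothetical): if the heading is section 4 and endH2 is
        # section 9, then num = 5 and endNum = 9, so sections 5..8 below
        # belong to 'Lactation Safety Information'.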
        total = endNum - num
        if num != 0:
            print(total, " section")
            for section in range(num, endNum):
                # ---------------------------------------------------------------------------------- fetch this section
                information = soup.select('div.eightcol section')[section]
                # print("part: ", information)
                # --------------------------------------------------------------------- read the section's h3 subheading
                little = ''
                little_title = information.find('h3')
                if little_title:  # guard added: find() returns None when the section has no h3
                    little = re.sub(r'\<.*?\>', '', little_title.text)
                    little = little.strip()
                if little == '':  # ------------- an empty h3 means the plain, default category
                    print("\nlittle_title : normal ")
                    cco = 1  # ------------- category 1
                    com(information, drugNN, cco)  # hand it to com() to fill the sheet
                if little == '省略':  # placeholder subheading (omitted)
                    print("\nlittle_title :", little)
                    cco = 2  # ------------- category 2
                    com(information, drugNN, cco)
                if little not in classList and little:
                    addd.append(little)  # unknown heading: flag it so the lists can be extended
        else:
            sheet['B%d' % drugNN].value = "No Lactation"
            print("No Lactation")
    else:
        sheet['B%d' % drugNN].value = "No Information"
        print("No Information")
    drugNN += 2
    t3 += 1
littleList = ['省略']  # subheading labels for row 2 (omitted)
for uu in indexList:
    ind = indexList.index(uu)
    ind = ind * 6 + 2  # 6 columns per category block, starting at column B
    letter = get_column_letter(ind)
    sheet['%s1' % letter].value = uu  # category title on row 1
    # print(letter, ind-1)
    for nn in littleList:
        little_ind = littleList.index(nn)
        littleInd = ind + little_ind
        littleLetter = get_column_letter(littleInd)
        sheet['%s2' % littleLetter].value = nn  # subheadings fill row 2 from the same column
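# Header sketch: category titles land on row 1 at columns B, H, N, ... and
# each block's subheadings run along row 2 starting from the same column,
# mirroring the cco = 6 * (ccc - 1) + 2 mapping used when writing values.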
workbook.save('SPS.xlsx')  # save the workbook
if addd:
    print('new categories to add : ', addd)
print("*********************************** ", "save ok", " ************************************")
1. lxml parses markup faster and more tolerantly than the built-in html.parser;
2. openpyxl can write more than 255 columns of data; it needs
from openpyxl.utils import get_column_letter to turn a column number into its letter before addressing a cell.
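A quick sketch of that conversion (runnable on its own):

from openpyxl.utils import get_column_letter
print(get_column_letter(2), get_column_letter(8), get_column_letter(300))  # -> B H KN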