# coding: utf-8
from openpyxl.reader.excel import load_workbook
from openpyxl.workbook import Workbook
from openpyxl.utils import get_column_letter
import re
from bs4 import BeautifulSoup
import requests
file = load_workbook('list.xlsx')  # load the Excel file holding the drug-name list
sheetnames = file.sheetnames
sheet = file[sheetnames[0]]
nrows = sheet.max_row  # number of rows
# print(nrows)
rowList = []
for i in range(1, nrows + 1):
    rowValues = sheet.cell(i, 1).value
    # print(rowValues)
    cut = re.sub(r'\(.*?\)', '', rowValues)  # drop any parenthesised part of the name
    cut = cut.replace(' ', '-')
    cut = cut.replace('+', '-')
    cut = cut.rstrip('-')  # strip leftover trailing hyphens
    # print(cut)
    if cut:
        rowList.append(cut)
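# For illustration (hypothetical entry): "Co-amoxiclav (Augmentin)" becomes
# "Co-amoxiclav": the bracketed part is removed, spaces and '+' turn into '-',
# and any trailing '-' is stripped, leaving a slug ready for the URL below.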
# --------------------------------------------------- #
indexList = ['省略待抓取列表']    # headings to scrape (list omitted here)
specialList = ['省略待抓取列表']  # special-case headings (list omitted here)
classList = indexList + specialList
# ----------------------------------------------------------------------------------------------------------- create the output Excel
workbook = Workbook()
sheet = workbook.active
# ------------------------------------------------------------------------------------------------------------- build the page URLs
starRow = 3
urlList = []
k = '网址'  # base URL of the target site (omitted)
for row in rowList:
    url = k + row
    urlList.append(url)  # assemble the full URL and keep it for the scraping loop
    sheet['A%d' % starRow].value = row  # write the name into the sheet via openpyxl
    sheet['A%d' % (starRow + 1)].value = url
    starRow += 2
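# Column-A layout sketch: row 3 = first drug name, row 4 = its URL, row 5 =
# next name, and so on; two rows per drug, which is why drugNN later also
# advances in steps of 2.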
def regular(mal_1, mal_2):  # extract and clean the text of the two matched divs
    nor_1 = ''
    nor_2 = ''
    if mal_1 and mal_2:
        nor_1 = mal_1.get_text()
        nor_2 = mal_2.get_text()
        nor_2 = nor_2.replace(' ', '')
        nor_2 = nor_2.replace('\xa0', '')  # non-breaking spaces (assumed: the two replaces looked identical in the source)
        nor_2 = nor_2.replace('\n\n', '')
        nor_2 = nor_2.replace('\t', '')  # strip the stray whitespace the page markup leaves behind
    return nor_1, nor_2
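# Usage sketch (the div classes come from the selectors in com() below):
#   nor_1, nor_2 = regular(i.find('div', attrs={'class': 'twocolsub'}),
#                          i.find('div', attrs={'class': 'sixcolsub last'}))
# nor_1 is the row label, nor_2 the compacted value text; both come back
# empty if either div is missing.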
# --------------------------------------------------- #
def excelRegular(nor_1, nor_2, link, drugnum, ccc):  # file the value under its heading column
    if nor_1 == '省略':  # heading placeholder (the real labels are omitted)
        cco = 6 * (ccc - 1) + 2  # every category owns a 6-column block starting at column B
        letter = get_column_letter(cco)
        sheet['%s%d' % (letter, drugnum)].value = nor_2
        sheet['%s%d' % (letter, drugnum + 1)].value = link  # the link goes on the URL row below
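# Column math, for example: ccc=1 writes to column 2 (B), ccc=2 to column 8 (H),
# ccc=3 to column 14 (N); the value lands on the drug's name row and its
# source link on the row underneath.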
# ---------------------------------------------------------------------------------------------------- pull the div contents out of a section
def com(information, drugnn, cco):
    kk = information.find_all("div")
    for i in kk:
        link = ''
        normal_1 = i.find('div', attrs={'class': 'twocolsub'})
        normal_2 = i.find('div', attrs={'class': 'sixcolsub last'})
        links = i.find('a')
        (nor_1, nor_2) = regular(normal_1, normal_2)
        if links:
            link = links.get('href')
        if nor_1 and nor_2:
            print(nor_1, ":", nor_2)
            excelRegular(nor_1, nor_2, link, drugnn, cco)  # write the pair into the sheet
addd = []
t3 = 1
drugNN = 3
for drugName in urlList:  # loop over every assembled URL
    Lactation = []
    H2_list = []
    endH2 = ''
    num = 0
    endNum = 0
    num_1 = 0
    t4 = 0
    page = requests.get(drugName)
    contents = page.text
    soup = BeautifulSoup(contents, 'lxml')  # lxml parses better than the built-in html.parser
    lactation = soup.select('div.eightcol section')
    h2_list = soup.find_all('h2')
    # ------------------------------------------------------------------------------------- find the h2 that follows the target, if any
    for i in h2_list:
        H2_list.append(re.sub(r'\<.*?\>|^\n|\n+(?=\n)|\n$', '', i.text))
    for i in H2_list:
        if i == 'Lactation Safety Information':
            num_1 = H2_list.index('Lactation Safety Information')
            t4 = num_1 + 1  # --------- t4 is the index of the next h2
            if t4 < len(H2_list):
                endH2 = H2_list[t4]
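    # Example (hypothetical page): if H2_list were ['Summary', 'Lactation
    # Safety Information', 'References'], then num_1 = 1, t4 = 2 and
    # endH2 = 'References', the heading that closes off the target block.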
    # ----------------------------------------------------------------------------------------------- check whether the page has the target info
    if soup.select('div.eightcol header h1')[0].text != 'Not Found':
        print(t3, " --------------------", soup.select('div.eightcol header h1')[0].text, "-----------------------",
              '\n', drugName)
        # -------------------------------------------------------------------------------------- work out the target section index num
        for ii in lactation:
            Lactation.append(re.sub(r'\<.*?\>|^\n|\n+(?=\n)|\n$', '', ii.text))
        # print("h2 _ lactation: ", Lactation)
        for i in Lactation:
            if i == 'Lactation Safety Information':
                print("yes")
                num = Lactation.index('Lactation Safety Information')
                num += 1  # the first data section sits right after the heading
            if endH2:
                if i == endH2:
                    endNum = Lactation.index(endH2)
            else:
                endNum = len(Lactation)  # no later h2, so read to the very end
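        # e.g. (hypothetical): if the heading is section 4 and endH2 is
        # section 9, then num = 5 and endNum = 9, so sections 5..8 below
        # belong to 'Lactation Safety Information'.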
        total = endNum - num
        if num != 0:
            print(total, " section")
            for section in range(num, endNum):
                # ---------------------------------------------------------------------------------- fetch this section
                information = soup.select('div.eightcol section')[section]
                # print("part: ", information)
                # --------------------------------------------------------------------- read the section's h3 subheading
                little = ''
                little_title = information.find('h3')
                if little_title:  # guard added: find() returns None when the section has no h3
                    little = re.sub(r'\<.*?\>', '', little_title.text)
                    little = little.strip()
                if little == '':  # ------------- an empty h3 means the plain, default category
                    print("\nlittle_title : normal ")
                    cco = 1  # ------------- category 1
                    com(information, drugNN, cco)  # hand it to com() to fill the sheet
                if little == '省略':  # placeholder subheading (omitted)
                    print("\nlittle_title :", little)
                    cco = 2  # ------------- category 2
                    com(information, drugNN, cco)
                if little not in classList and little:
                    addd.append(little)  # unknown heading: flag it so the lists can be extended
        else:
            sheet['B%d' % drugNN].value = "No Lactation"
            print("No Lactation")
    else:
        sheet['B%d' % drugNN].value = "No Information"
        print("No Information")
    drugNN += 2
    t3 += 1
littleList = ['省略']  # subheading labels for row 2 (omitted)
for uu in indexList:
    ind = indexList.index(uu)
    ind = ind * 6 + 2  # 6 columns per category block, starting at column B
    letter = get_column_letter(ind)
    sheet['%s1' % letter].value = uu  # category title on row 1
    # print(letter, ind-1)
    for nn in littleList:
        little_ind = littleList.index(nn)
        littleInd = ind + little_ind
        littleLetter = get_column_letter(littleInd)
        sheet['%s2' % littleLetter].value = nn  # subheadings fill row 2 from the same column
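# Header sketch: category titles land on row 1 at columns B, H, N, ... and
# each block's subheadings run along row 2 starting from the same column,
# mirroring the cco = 6 * (ccc - 1) + 2 mapping used when writing values.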
workbook.save('SPS.xlsx')  # save the workbook
if addd:
    print('new categories to add : ', addd)
print("*********************************** ", "save ok", " ************************************")
1. lxml parses markup faster and more tolerantly than the built-in html.parser;
2. openpyxl can write more than 255 columns of data; it needs
from openpyxl.utils import get_column_letter to turn a column number into its letter before addressing a cell.
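A quick sketch of that conversion (runnable on its own):

from openpyxl.utils import get_column_letter
print(get_column_letter(2), get_column_letter(8), get_column_letter(300))  # -> B H KN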