import os
from selenium import webdriver
import time
import re
browser = webdriver.Chrome()  # Module-level Chrome WebDriver instance shared by the functions in this file
def click_event():
    """Click every region button on the loaded page and gather core-gene data.

    For each region button found on the page the button is clicked,
    ``click_core()`` is called to click that region's core-gene polygons, and
    the names and sequences it returns are accumulated for the whole strain.

    A name that is already present gets a numeric suffix starting at 2
    (``name``, ``name2``, ``name3``, ...). The suffix is increased until the
    resulting name is unused, so the returned names are always unique.

    Returns:
        tuple[list[str], list[str]]: ``(polygons_name, AA_seqs)`` - the
        core-gene names and the core-gene AA sequences of one strain, ready
        to be written to a text file.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3;
    # ``find_elements(By.CSS_SELECTOR, ...)`` works on both old and new releases.
    from selenium.webdriver.common.by import By

    polygons_name = []  # all core-gene names of one strain
    AA_seqs = []  # all core-gene sequences of one strain
    used_names = set()  # O(1) membership test for names handed out so far

    # Every region button, to be clicked one after another.
    regbutton_list = browser.find_elements(
        By.CSS_SELECTOR, 'div[style="display: flex; flex-wrap: wrap"]>div'
    )
    for regbutton in regbutton_list:
        regbutton.click()  # open a single region
        # Click the region's core-gene polygons and fetch names and sequences.
        polygon_name, AA_seq = click_core()

        for polygon in polygon_name:
            unique_name = polygon
            suffix = 2
            # Keep counting up until the candidate does not collide with any
            # name already emitted (including earlier suffixed names).
            while unique_name in used_names:
                unique_name = polygon + str(suffix)
                suffix += 1
            used_names.add(unique_name)
            polygons_name.append(unique_name)

        AA_seqs.extend(AA_seq)
        time.sleep(2)  # pause before the next region button is clicked
    return polygons_name, AA_seqs
def click_core():
    """Click each core-gene polygon and collect its name and AA sequence.

    After every click the non-empty headings reported by ``get_heading()`` are
    recorded as names and the first sequence reported by ``get_aaseq()`` is
    recorded as the gene's sequence.

    Returns:
        tuple[list[str], list[str]]: ``(polygon_name, AA_seq)`` - the names
        and the AA sequences gathered for the clicked polygons.

    Raises:
        IndexError: If ``get_aaseq()`` returns an empty list after a click.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3;
    # ``find_elements(By.CSS_SELECTOR, ...)`` works on both old and new releases.
    from selenium.webdriver.common.by import By

    core_list = browser.find_elements(
        By.CSS_SELECTOR,
        'polygon[class="svgene-type-biosynthetic svgene-orf svgene-selected-orf"]',
    )
    polygon_name = []  # names recorded for the clicked core genes
    AA_seq = []  # sequences recorded for the clicked core genes
    for polygon in core_list:
        polygon.click()  # select the core gene
        # Headings with empty text are ignored.
        # NOTE(review): presumably only the displayed region yields non-empty
        # heading text, giving one name per click - confirm against the page.
        polygon_name.extend(head for head in get_heading() if head != "")
        # Sequence of the single core gene that was just clicked.
        AA_seq.append(get_aaseq()[0])
        time.sleep(1)  # wait 1 s before clicking the next polygon
    return polygon_name, AA_seq
def get_heading():
    """Return the normalised text of every region heading element on the page.

    The headings are located through this element chain::

        <div class="page" id="r10c1" style="display: block;">
          <div class="region-grid">
            <div class="content">
              <div class="description-container">
                <div class="heading">

    In each heading text ``" - "`` is replaced by ``"|"`` and all remaining
    spaces are removed.

    Returns:
        list[str]: One entry per matched heading element, in the order the
        elements are returned by the driver. Entries may be empty strings.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3;
    # ``find_elements(By.CSS_SELECTOR, ...)`` works on both old and new releases.
    from selenium.webdriver.common.by import By

    heading = browser.find_elements(
        By.CSS_SELECTOR,
        '.page>.region-grid>.content>.description-container>.heading',
    )
    return [head.text.replace(" - ", "|").replace(" ", "") for head in heading]
def get_aaseq():
    """Return the AA sequences exposed by the matched clipboard panels.

    Every element matching the ``focus-clipboard`` selector is serialised to
    its outer HTML, and the ``data-seq`` attribute value that follows the
    ``AA sequence:`` label is extracted with a regular expression.

    Returns:
        list[str]: One sequence per panel that contains an ``AA sequence``
        entry, in the order the elements are returned by the driver. Panels
        without such an entry are skipped, so the list may be empty.
    """
    # ``find_elements_by_css_selector`` was removed in Selenium 4.3;
    # ``find_elements(By.CSS_SELECTOR, ...)`` works on both old and new releases.
    from selenium.webdriver.common.by import By

    # Compiled once, outside the loop. ``[^"]*`` stops at the attribute's
    # closing quote; a greedy ``.*`` would run on to the last quote on the line
    # and swallow any markup that follows the sequence. The same approach could
    # be used for the nucleotide sequence.
    data_rule = re.compile(
        r' AA sequence: <span class="clipboard-copy" data-seq="([^"]*)"'
    )

    # NOTE(review): ``div[style=""]`` presumably selects the page that is
    # currently displayed - confirm against the antiSMASH markup.
    copy = browser.find_elements(
        By.CSS_SELECTOR,
        'div[style=""]>.region-grid>.focus-panel div[class="focus-clipboard"]',
    )

    aaseq = []
    for c in copy:
        # Outer HTML of the clipboard panel; the sequence lives in an attribute.
        copy_html = c.get_attribute('outerHTML')
        match = data_rule.search(copy_html)
        if match is None:
            # No AA sequence in this panel: skip it instead of raising a bare
            # IndexError from indexing an empty match list.
            continue
        aaseq.append(match.group(1))
    return aaseq
def sele_html(all_path, file_list):
    """Drive the browser over every local result page and write FASTA files.

    For each entry of ``file_list`` the page ``<all_path><entry>/index.html``
    is opened, ``click_event()`` collects the core-gene names and sequences,
    and they are written as ``>name`` / sequence line pairs to
    ``<output prefix><entry>.fasta``.

    Args:
        all_path: Directory that holds the result folders. It is concatenated
            directly with the entry name, so it must end with a path separator.
        file_list: Names of the result folders inside ``all_path``.

    Raises:
        IndexError: If ``click_event()`` returns fewer sequences than names.

    The browser is always quit when the function exits, including on errors,
    and each output file is closed even if writing fails.
    """
    try:
        for file_name in file_list:
            url = "file:///" + all_path + file_name + "/index.html"
            browser.get(url)
            time.sleep(2)  # give the page 2 s to load
            # Click through the page and fetch this strain's names and sequences.
            polygons_name, AA_seqs = click_event()

            # Output location: edit the prefix literal to the desired directory.
            file_path = "文件存储路径" + file_name + ".fasta"
            with open(file_path, "w") as fasta:
                # Indexing (rather than zip) keeps a name/sequence count
                # mismatch loud instead of silently truncating the output.
                for i, name in enumerate(polygons_name):
                    fasta.write(">" + name + "\n")
                    fasta.write(AA_seqs[i] + "\n")
            print(file_name + " 写入完毕!")
    finally:
        # Release the browser even when one of the pages fails.
        browser.quit()
def main():
    """List the antiSMASH result directory and process every entry in it."""
    # Change this path to point at the antiSMASH results directory.
    results_root = "antismash结果路径"
    entries = os.listdir(results_root)
    sele_html(results_root, entries)
# Run the batch extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()
# Batch extraction of core-gene AA sequences from antiSMASH results with selenium
# First published 2021-01-29 13:23:05