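# Example: the virus annotation tool reports a lineage such as
#   Viruses;Phixviricota;Malgrandaviricetes;Petitvirales;Microviridae;;
# The last non-empty rank (here Microviridae) is what gets looked up in the ICTV table.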
import os
import pandas as pd
# Directory holding the per-sample taxonomy tables and the ICTV reference.
WORK_DIR = "C:/Users/fordata/Desktop/研究生/第二个想法(16s肠型+宏基因组功能)/第二篇病毒组/result/tax"
# ICTV reference table (tab-separated); its last column is reported as
# Genome_Composition.
ICTV_TXT = "ICTV去冗余.txt"
# Substring that marks a file in the directory as an input table.
INPUT_TAG = "fetal_95"
# Suffix appended to an input file name to build its output file name.
OUTPUT_SUFFIX = "_TAX.txt"
NOT_ASSIGNED = "not assigned"
HEADER = (
    "vOTU\tGenome_Composition\t"
    "Realm;Subrealm;Kingdom;Subkingdom;Phylum;Subphylum;Class;Subclass;"
    "Order;Suborder;Family;Subfamily;Genus;Subgenus"
)


def _build_taxon_index(ictv_df):
    """Map every taxon name in the ICTV table to its first (row, column) position.

    Rows are scanned top to bottom and columns left to right, so the stored
    position is the first row containing the name and the first column inside
    that row. Building this once replaces a full-table scan per input line.
    Only string cells are indexed (missing cells are read as NaN floats), and
    names are stripped so stray whitespace in the table does not hide a taxon.
    """
    taxon_index = {}
    for row_pos, row in enumerate(ictv_df.itertuples(index=False, name=None)):
        for col_pos, cell in enumerate(row):
            if not isinstance(cell, str):
                continue
            key = cell.strip()
            if key:
                taxon_index.setdefault(key, (row_pos, col_pos))
    return taxon_index


def _last_taxon(lineage):
    """Return the last non-empty rank of a ';'-separated lineage, or None."""
    ranks = [item.strip() for item in lineage.split(";")]
    ranks = [rank for rank in ranks if rank]
    return ranks[-1] if ranks else None


def _annotate_line(line, ictv_df, taxon_index):
    """Build the output record (with trailing newline) for one input line.

    The first tab-separated field is the vOTU name and the last field is the
    lineage. The lineage's last non-empty rank is looked up by exact name; on
    a hit the record holds the ICTV row's last column and the row's values
    from the first column up to and including the matched one. Otherwise both
    fields are "not assigned".
    """
    fields = line.rstrip("\n").split("\t")
    name = fields[0].strip()
    last_tax = _last_taxon(fields[-1])

    hit = None
    if last_tax is not None:
        print(f"Searching for last_Tax: {last_tax}")
        # Exact-name lookup: a substring/regex search can land on a row that
        # merely contains the name inside a longer one.
        hit = taxon_index.get(last_tax)

    if hit is None:
        print("No match found.")
        return f"{name}\t{NOT_ASSIGNED}\t{NOT_ASSIGNED}\n"

    row_pos, col_pos = hit
    matching_row = ictv_df.iloc[row_pos]
    print(f"Matching row found: {matching_row}")
    lineage_values = matching_row.iloc[: col_pos + 1].values
    genome_composition = matching_row.iloc[-1]
    print(f"Output from start to last_Tax: {lineage_values}")
    # Missing ranks are NaN in the table and are written as the text "nan",
    # as before.
    lineage = ";".join(str(value).strip() for value in lineage_values)
    return f"{name}\t{genome_composition}\t{lineage}\n"


def _annotate_file(in_path, out_path, ictv_df, taxon_index):
    """Annotate every data line of in_path and write the result to out_path."""
    # "w" rather than "a": re-running must not append a second header and a
    # second copy of every record to an existing output file.
    with open(in_path, "r") as src, open(out_path, "w") as out:
        out.write(HEADER + "\n")
        next(src, None)  # Skip the header; tolerate an empty file.
        for line in src:
            if not line.strip():
                continue  # Blank lines carry no record.
            out.write(_annotate_line(line, ictv_df, taxon_index))


def main(work_dir=".", ictv_name=ICTV_TXT):
    """Annotate every "fetal_95" table in work_dir against the ICTV reference.

    For each input file <name>, writes <name>_TAX.txt next to it.

    Args:
        work_dir: Directory containing the input tables and the ICTV file.
        ictv_name: File name of the tab-separated ICTV reference in work_dir.
    """
    ictv_df = pd.read_csv(os.path.join(work_dir, ictv_name), sep="\t")
    taxon_index = _build_taxon_index(ictv_df)

    # Output names still contain the input tag, so they must be excluded or a
    # second run would treat its own results as inputs.
    inputs = sorted(
        entry
        for entry in os.listdir(work_dir)
        if INPUT_TAG in entry
        and not entry.endswith(OUTPUT_SUFFIX)
        and os.path.isfile(os.path.join(work_dir, entry))
    )
    for entry in inputs:
        in_path = os.path.join(work_dir, entry)
        _annotate_file(in_path, in_path + OUTPUT_SUFFIX, ictv_df, taxon_index)


if __name__ == "__main__":
    # Change working directory, then process the files found there.
    os.chdir(WORK_DIR)
    main()