根据注释软件结果从ICTV中获取物种完整注释（python代码）-CSDN博客

本文链接：https://blog.csdn.net/m0_53945548/article/details/142917620

比如病毒注释软件给出的注释只到科一级（末尾的分类等级为空）：

Viruses;Phixviricota;Malgrandaviricetes;Petitvirales;Microviridae;;

import os
import pandas as pd

# Run from the folder that holds the annotation results so that every file
# name used below can stay a bare relative name.
os.chdir(
    "C:/Users/fordata/Desktop/研究生/第二个想法(16s肠型＋宏基因组功能)/第二篇病毒组/result/tax"
)
dir_ls = os.listdir(os.getcwd())

# ICTV reference table, read as tab-delimited text with the first line
# used as the column header.
ICTV_txt = "ICTV去冗余.txt"
df = pd.read_csv(ICTV_txt, delimiter="\t")

# Select the annotation tables to process: every directory entry whose name
# contains "fetal_95". Results of this script are written next to the inputs
# as "<input>_TAX.txt", so they carry the same marker; they are excluded
# here, otherwise a second run would feed its own output back in as input.
result_ls = [
    file
    for file in dir_ls
    if "fetal_95" in file and not file.endswith("_TAX.txt")
]

# For every selected table, look up the deepest assigned taxon of each vOTU
# in the ICTV reference and write "<input>_TAX.txt" with three tab-separated
# fields: vOTU name, genome composition (last ICTV column) and the lineage
# from the first ICTV column up to the column holding that taxon.
for file2 in result_ls:
    # Mode "w": the header is emitted on every run, so appending to a result
    # left by an earlier run would stack a second header and duplicate rows.
    with open(file2, "r") as f2, open(f"{file2}_TAX.txt", "w") as out:
        print("vOTU\tGenome_Composition\tRealm;Subrealm;Kingdom;Subkingdom;Phylum;Subphylum;Class;Subclass;Order;Suborder;Family;Subfamily;Genus;Subgenus", file=out)
        next(f2, None)  # Skip the header; the default tolerates an empty file
        for line in f2:
            if not line.strip():
                continue  # blank line: there is no vOTU to annotate

            # First field is the vOTU name, last field the ';'-separated taxonomy
            fields = line.rstrip("\n").split("\t")
            name = fields[0].strip()
            Tax = fields[-1]

            # Deepest non-empty rank; None when the taxonomy field is empty
            assigned = [item for item in Tax.split(";") if item]
            last_Tax = assigned[-1] if assigned else None
            print(f"Searching for last_Tax: {last_Tax}")

            # Exact cell match. A substring/regex search could select a row
            # that merely contains the name inside a longer value, after
            # which the exact column lookup below would find nothing.
            first_matching_index = None
            if last_Tax is not None:
                rows_with_hit = df.eq(last_Tax).any(axis=1)
                if rows_with_hit.any():
                    # idxmax on a boolean Series gives the first True label
                    first_matching_index = rows_with_hit.idxmax()

            if first_matching_index is None:
                print("No match found.")
                print(name, "not assigned", "not assigned", sep="\t", file=out)
                continue

            matching_row = df.loc[first_matching_index]
            print(f"Matching row found: {matching_row}")

            # Label of the first column whose cell equals last_Tax
            last_tax_index = matching_row[matching_row == last_Tax].index[0]

            # Lineage = every column from the first one through that column
            # (label slicing with .loc includes the end label)
            columns_before_last_tax = matching_row.loc[:last_tax_index]
            Genome_Composition = matching_row.iloc[-1]
            print(f"Output from start to last_Tax: {columns_before_last_tax.values}")

            # NOTE(review): cells are rendered with str(), so a cell pandas
            # read as missing is written as "nan" — confirm this is wanted.
            lineage = ";".join(
                str(value).strip() for value in columns_before_last_tax.values
            )
            print(name, Genome_Composition, lineage, sep="\t", file=out)