import cobra
import pandas as pd
import numpy as np
import os, sys, argparse, logging
from cobra.flux_analysis import flux_variability_analysis, single_gene_deletion
# RIPTiDe
import riptide
# Statistics
from sklearn.manifold import MDS
from skbio.diversity import beta_diversity
from skbio.stats.distance import permanova
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
# =========================
# Logging
# =========================
def setup_logger(outdir):
log_file = os.path.join(outdir, "pipeline.log")
logging.basicConfig(level=logging.INFO,
format="%(asctime)s [%(levelname)s %(message)s",
handlers=[logging.FileHandler(log_file),
logging.StreamHandler(sys.stdout)])
return log_file
# =========================
# Core helpers
# =========================
def read_sbml(path):
model = cobra.io.read_sbml_model(path)
logging.info(f"✅ Model loaded: {model.id}, Reactions={len(model.reactions)}, "
f"Metabolites={len(model.metabolites)}, Genes={len(model.genes)}")
return model
def find_biomass_reaction(model):
biomass_rxns = [r for r in model.reactions if "biomass" in r.id.lower() or "biomass" in r.name.lower()]
if biomass_rxns:
biomass = biomass_rxns[0]
model.objective = biomass
logging.info(f"✅ Biomass objective set: {biomass.id}")
return biomass
else:
logging.warning("⚠️ No biomass reaction detected automatically")
return None
def find_atpm_reaction(model):
candidates = [rid for rid in ["ATPM","R_ATPM","DM_atp_c_"] if rid in model.reactions]
if candidates:
rxn = model.reactions.get_by_id(candidates[0])
logging.info(f"ATPM reaction {rxn.id}: bounds {rxn.lower_bound},{rxn.upper_bound}")
return rxn
else:
logging.warning("⚠️ No ATPM reaction detected")
return None
# =========================
# Media: S4
# =========================
def load_s4_gcb(path_s4):
xls = pd.ExcelFile(path_s4)
df = xls.parse("In Silico")
out = df[["Reaction ID","LB (Equally Scaled)","LB (Molarity Scaled)"]].copy()
out.columns = ["rxn_id","equally","molarity"]
return out
def build_media_from_s4(model, s4_df, scaling="equally",
oxygen_override=None, log_prefix="S4"):
lbcol = "equally" if scaling=="equally" else "molarity"
applied = {}
for _, row in s4_df.iterrows():
rid = row["rxn_id"]
if rid not in model.reactions: continue
if "_e" not in rid: continue
rxn = model.reactions.get_by_id(rid)
val = row[lbcol]
if pd.isna(val):
continue
rxn.lower_bound = -float(val)
rxn.upper_bound = 1000.0
applied[rid] = rxn.lower_bound
# oxygen special
if oxygen_override is not None and "EX_o2_e_" in model.reactions:
model.reactions.EX_o2_e_.lower_bound = -float(oxygen_override)
applied["EX_o2_e_"] = -float(oxygen_override)
logging.info(f"{log_prefix} medium applied with {len(applied)} exchanges set")
return applied
# =========================
# Media: Custom GC broth + Vitox
# =========================
def build_media_gc_vitox(model, scaling="molarity", oxygen_override=None):
"""
Custom GC broth + Vitox composition
Values taken from proteose peptone analysis (mM) and Vitox supplement working concentrations.
"""
# Amino acids from proteose peptone (mM)
base = {
"EX_gly_e_":12.99, "EX_ala_L_e_":8.75, "EX_glu_L_e_":8.16, "EX_leu_L_e_":6.40,
"EX_asp_L_e_":5.75, "EX_val_L_e_":4.48, "EX_pro_L_e_":4.95, "EX_lys_L_e_":4.31,
"EX_arg_L_e_":3.70, "EX_ile_L_e_":3.66, "EX_phe_L_e_":3.18, "EX_thr_L_e_":1.89,
"EX_ser_L_e_":2.28, "EX_his_L_e_":1.26, "EX_met_L_e_":1.31, "EX_tyr_L_e_":1.32,
"EX_cys_L_e_":0.37, "EX_asn_L_e_":0.34, "EX_trp_L_e_":0.22
}
# Ions (mM)
base.update({
"EX_na1_e_":24.87,"EX_cl_e_":10.75,"EX_pi_e_":2.38,"EX_k_e_":5.05,"EX_so4_e_":0.58,
"EX_mg2_e_":0.064,"EX_ca2_e_":0.049,"EX_fe3_e_":0.006
})
# Extra carbon source from starch (as glucose)
base["EX_glc_D_e_"] = 2.8
# Vitox additions (approx mM)
vitox = {
"EX_cbl1_e_":0.000074, "EX_ade_e_":0.037, "EX_gln_L_e_":0.684, "EX_gua_e_":0.020,
"EX_paba_e_":0.009, "EX_cyst_L_e_":0.092, "EX_nad_e_":0.003, "EX_thmpp_e_":0.003,
"EX_fe3_e_":0.036, "EX_thm_e_":0.089, "EX_cys_L_e_":1.54, "EX_glc_D_e_":5.56
}
# Merge
media = {**base, **vitox}
applied = {}
for ex_id,val in media.items():
if ex_id not in model.reactions:
continue
rxn = model.reactions.get_by_id(ex_id)
rxn.lower_bound = -float(val)
rxn.upper_bound = 1000.0
applied[ex_id] = -float(val)
if oxygen_override is not None and "EX_o2_e_" in model.reactions:
model.reactions.EX_o2_e_.lower_bound = -float(oxygen_override)
applied["EX_o2_e_"] = -float(oxygen_override)
logging.info(f"GC+Vitox medium applied with {len(applied)} exchanges set")
return applied
# =========================
# Core analyses
# =========================
def run_fba(model, biomass):
sol = model.optimize()
mu = sol.objective_value
if mu>0:
dt = np.log(2)*60.0/mu
else:
dt = np.inf
logging.info(f"FBA: mu={mu:.4f} 1/h, DT={dt:.2f} min")
return mu, dt, sol
def run_fva(model):
fva_res = flux_variability_analysis(model, fraction_of_optimum=1.0)
return fva_res
def run_sgd(model):
sgd_res = single_gene_deletion(model)
return sgd_res
# =========================
# RIPTiDe contextualization
# =========================
def contextualize_with_riptide(model, expr_path, outdir):
"""
Contextualize model using RIPTiDe with transcriptomic data
- Exchange bounds: ±10 (O2 ±20)
- maxfit_contextualize(min_frac=0.1, max_frac=0.8, n=1000)
- Sampling n=500
"""
# Load transcriptomics (assume gene ID → TPM/FPKM)
expr = pd.read_csv(expr_path, index_col=0, squeeze=True)
logging.info(f"Loaded expression file {expr_path} with {len(expr)} entries")
# Set uniform exchange bounds ±10, except oxygen ±20,先统一设置 exchange bounds (±10, O₂=±20)
for rxn in model.exchanges:
if "o2" in rxn.id.lower():
rxn.lower_bound = -20
else:
rxn.lower_bound = -10
rxn.upper_bound = 1000.0
# Contextualize,筛选与表达数据一致的代谢子网
ctx = riptide.maxfit_contextualize(model, expr,
min_frac=0.1, max_frac=0.8, n=1000)
logging.info("RIPTiDe contextualization complete")
# Sampling,做 500 次 flux 抽样
samples = riptide.sample(ctx, n=500)
df = pd.DataFrame(samples, columns=[r.id for r in ctx.reactions])
df.to_csv(os.path.join(outdir, "riptide_samples.tsv"), sep="\t")
logging.info(f"RIPTiDe sampling complete, saved {df.shape} flux profiles")
return df
# =========================
# Downstream statistics
# =========================
def analyze_flux_profiles(df, metadata, outdir):
"""
Perform:
- Bray–Curtis NMDS
- PERMANOVA
- RandomForest classification with AUC
"""
# Compute Bray–Curtis dissimilarity,计算 Bray–Curtis 距离矩阵
dist = beta_diversity("braycurtis", df.values, ids=df.index)
# 使用 MDS 实现 NMDS
nmds = MDS(n_components=2,
metric=False,
max_iter=3000,
eps=1e-12,
dissimilarity='precomputed',
random_state=42,
n_jobs=1,
n_init=1)
# NMDS ordination (2D),用 NMDS 可视化群落代谢流差异
coords_array = nmds.fit_transform(dist.data)
coords = pd.DataFrame(coords_array, index=df.index, columns=["NMDS1", "NMDS2"])
coords.to_csv(os.path.join(outdir, "nmds_coords.tsv"), sep="\t")
# PERMANOVA,用 PERMANOVA 检验组间差异是否显著
meta = pd.Series(metadata, index=df.index)
perma_res = permanova(dist, meta, permutations=999)
with open(os.path.join(outdir,"permanova.txt"),"w") as f:
f.write(str(perma_res))
# RandomForest classification,用 随机森林 分类 flux profiles,输出 AUC 评估判别力
clf = RandomForestClassifier(n_estimators=1500, max_features=20, random_state=42)
y = meta.values
clf.fit(df.values, y)
# AUC (binary assumed)
if len(set(y))==2:
probs = clf.predict_proba(df.values)[:,1]
auc = roc_auc_score(y, probs)
else:
auc = np.nan
with open(os.path.join(outdir,"rf_auc.txt"),"w") as f:
f.write(f"AUC={auc}\n")
logging.info("Downstream statistics complete")
# =========================
# Main
# =========================
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--sbml", default="NGO_557.sbml") # 默认模型路径
ap.add_argument("--s4", default="msystems.01265-22-s0004.xlsx")
ap.add_argument("--expr", help="Transcriptomics file for RIPTiDe")
ap.add_argument("--mode", choices=["s4","custom_gc_vitox"], default="s4")
ap.add_argument("--scaling", choices=["equally","molarity"], default="molarity")
ap.add_argument("--oxygen_override", type=float, default=None)
ap.add_argument("--outdir", default="results_out") # 默认输出文件夹
args = ap.parse_args()
os.makedirs(args.outdir,exist_ok=True)
setup_logger(args.outdir)
model = read_sbml(args.sbml)
biomass = find_biomass_reaction(model)
find_atpm_reaction(model)
# Media setup
if args.mode=="s4":
if not args.s4:
logging.error("S4 path required for mode s4")
return
s4df = load_s4_gcb(args.s4)
build_media_from_s4(model,s4df,args.scaling,oxygen_override=args.oxygen_override)
elif args.mode=="custom_gc_vitox":
build_media_gc_vitox(model,scaling=args.scaling,oxygen_override=args.oxygen_override)
# Analyses
mu,dt,sol = run_fba(model,biomass)
fva = run_fva(model); fva.to_csv(os.path.join(args.outdir,"fva.tsv"),sep="\t")
sgd = run_sgd(model); sgd.to_csv(os.path.join(args.outdir,"sgd.tsv"),sep="\t")
# RIPTiDe if transcriptomics provided
if args.expr:
flux_df = contextualize_with_riptide(model, args.expr, args.outdir)
# downstream stats skeleton (needs metadata, here dummy example)
metadata = {idx: ("Group1" if i < len(flux_df)//2 else "Group2")
for i,idx in enumerate(flux_df.index)}
analyze_flux_profiles(flux_df, metadata, args.outdir)
logging.info("Pipeline complete")
if __name__=="__main__":
main()
这段代码的报错为:D:\python+pycharm\Miniconda\envs\skbio-python39\python.exe D:/实验室/python/25.8.22.py
Traceback (most recent call last):
File "D:\实验室\python\25.8.22.py", line 283, in <module>
main()
File "D:\实验室\python\25.8.22.py", line 253, in main
model = read_sbml(args.sbml)
File "D:\实验室\python\25.8.22.py", line 33, in read_sbml
model = cobra.io.read_sbml_model(path)
File "D:\python+pycharm\Miniconda\envs\skbio-python39\lib\site-packages\cobra\io\sbml.py", line 460, in read_sbml_model
raise e
File "D:\python+pycharm\Miniconda\envs\skbio-python39\lib\site-packages\cobra\io\sbml.py", line 457, in read_sbml_model
doc = _get_doc_from_filename(filename)
File "D:\python+pycharm\Miniconda\envs\skbio-python39\lib\site-packages\cobra\io\sbml.py", line 504, in _get_doc_from_filename
raise IOError(
OSError: The file with 'NGO_557.sbml' does not exist, or is not an SBML string. Provide the path to an existing SBML file or a valid SBML string representation:
进程已结束,退出代码1