20171030_chr_download — File download, improved version


  • /20171030_chr_download/src/nuc/sw/action/DownloadAction.java
package nuc.sw.action;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

import com.opensymphony.xwork2.ActionSupport;

public class DownloadAction extends ActionSupport {
    private String inputPath;
    private String contentType;
    private String downFileName;
    public String getInputPath() {
        return inputPath;
    }
    public void setInputPath(String inputPath) {
        try {
            // Re-decode the GET parameter: the container parses it as ISO-8859-1 by default
            this.inputPath = new String(inputPath.getBytes("iso8859-1"), "utf-8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
    public String getContentType() {
        return contentType;
    }
    public void setContentType(String contentType) {
        this.contentType = contentType;
    }
    public String getDownFileName() throws UnsupportedEncodingException {
        // URL-encode the file name so non-ASCII names survive in the Content-Disposition header
        return URLEncoder.encode(downFileName, "utf-8");
    }
    public void setDownFileName(String downFileName) {
        try {
            // Same ISO-8859-1 -> UTF-8 re-decode as setInputPath()
            this.downFileName = new String(downFileName.getBytes("iso8859-1"), "utf-8");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
    // Entry point for the download: the stream result reads from this InputStream
    public InputStream getTargetFile() {
        /* Earlier version read the file from inside the web app:
        InputStream is = ServletActionContext.getServletContext().getResourceAsStream(inputPath);
        return is; */
        InputStream is = null;
        try {
            is = new FileInputStream(inputPath);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        return is;
    }
}
  • /20171030_chr_download/src/struts.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE struts PUBLIC
    "-//Apache Software Foundation//DTD Struts Configuration 2.5//EN"
    "http://struts.apache.org/dtds/struts-2.5.dtd">
<struts>
    <!-- Add packages here -->
    <package name="downloadPackage" namespace="/" extends="struts-default">
      <action name="download" class="nuc.sw.action.DownloadAction">
       <result type="stream">
        <param name="contentType">${contentType}</param>
        <param name="inputName">targetFile</param>
        <param name="contentDisposition">attachment;filename="${downFileName}"</param>     
       </result>
      </action>
     </package>
</struts>
  • /20171030_chr_download/WebContent/download.jsp
<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>File Download</title>
</head>
<body>
 <a href="download?inputPath=F:\123.txt&contentType=text/plain&downFileName=123.txt">Download using the Struts2 framework</a>
</body>
</html>
>>> splice_data, spliced_ptms, altered_flanks = project.project_ptms_onto_MATS(SE_events = SE_data, MXE_events = MXE_data, A5SS_events = A5SS_data, A3SS_events = A3SS_data, RI_events = RI_data, coordinate_type = 'hg38', identify_flanking_sequences = True)
Projecting PTMs onto MATS splice events using hg38 coordinates.
Skipped Exon events:   0%|          | 0/3635 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "<python-input-28>", line 1, in <module>
    splice_data, spliced_ptms, altered_flanks = project.project_ptms_onto_MATS(SE_events = SE_data, MXE_events = MXE_data, A5SS_events = A5SS_data, A3SS_events = A3SS_data, RI_events = RI_data, coordinate_type = 'hg38', identify_flanking_sequences = True)
  File "/mnt/940660EA0660CEB4/PTM-POSE/venv/lib/python3.13/site-packages/ptm_pose/project.py", line 416, in project_ptms_onto_MATS
    spliced_events['SE'], SE_ptms = project_ptms_onto_splice_events(SE_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'exonStart_0base', region_end_col = 'exonEnd', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based', taskbar_label = "Skipped Exon events", separate_modification_types=separate_modification_types, PROCESSES = SE_processes)
  File "/mnt/940660EA0660CEB4/PTM-POSE/venv/lib/python3.13/site-packages/ptm_pose/project.py", line 327, in project_ptms_onto_splice_events
    splice_data, spliced_ptm_info = find_ptms_in_many_regions(splice_data, ptm_coordinates, chromosome_col = chromosome_col, strand_col = strand_col, region_start_col = region_start_col, region_end_col = region_end_col, dPSI_col = dPSI_col, sig_col = sig_col, event_id_col = event_id_col, gene_col = gene_col, extra_cols = extra_cols, annotate_original_df = annotate_original_df, coordinate_type = coordinate_type, start_coordinate_system=start_coordinate_system, end_coordinate_system=end_coordinate_system, taskbar_label = taskbar_label, separate_modification_types=separate_modification_types)
  File "/mnt/940660EA0660CEB4/PTM-POSE/venv/lib/python3.13/site-packages/ptm_pose/project.py", line 212, in find_ptms_in_many_regions
    if annotate_original_df:
  File "/mnt/940660EA0660CEB4/PTM-POSE/venv/lib/python3.13/site-packages/pandas/core/generic.py", line 1577, in __nonzero__
    raise ValueError(
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
Contents of the project.py script:

import numpy as np
import pandas as pd
import multiprocessing
import datetime

from ptm_pose import pose_config, helpers
from ptm_pose import flanking_sequences as fs

from tqdm import tqdm


def find_ptms_in_region(ptm_coordinates, chromosome, strand, start, end, gene = None, coordinate_type = 'hg38'):
    """
    Given a genomic region in either hg38 or hg19 coordinates (such as the region encoding an exon of interest), identify PTMs that are mapped to that region.

    Parameters
    ----------
    chromosome: str
        chromosome where region is located
    strand: int
        DNA strand the region is found on (1 for forward, -1 for reverse)
    start: int
        start position of region on the chromosome/strand (should always be less than end)
    end: int
        end position of region on the chromosome/strand (should always be greater than start)
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg38'.

    Returns
    -------
    ptms_in_region: pandas.DataFrame
        dataframe containing all PTMs found in the region. If no PTMs are found, returns an empty dataframe.
    """
    #restrict to PTMs on the same chromosome and strand
    ptms_in_region = ptm_coordinates[(ptm_coordinates['Chromosome/scaffold name'] == chromosome) & (ptm_coordinates['Strand'] == strand)].copy()

    if coordinate_type in ['hg18', 'hg19', 'hg38']:
        loc_col = f'Gene Location ({coordinate_type})'
    else:
        raise ValueError('Coordinate type must be hg38 or hg19')

    #check to make sure the start value is less than the end coordinate. If it is not, treat the end coordinate as the start and the start coordinate as the end
    if start < end:
        ptms_in_region = ptms_in_region[(ptms_in_region[loc_col] >= start) & (ptms_in_region[loc_col] <= end)]
    else:
        ptms_in_region = ptms_in_region[(ptms_in_region[loc_col] <= start) & (ptms_in_region[loc_col] >= end)]

    #extract only PTM information from dataframe (if no ptms, return empty dataframe)
    if not ptms_in_region.empty:
        #grab uniprot id and residue
        ptms_in_region = ptms_in_region[['Source of PTM', 'UniProtKB Accession', 'Isoform ID', 'Isoform Type', 'Residue', 'PTM Position in Isoform', loc_col, 'Modification', 'Modification Class', 'Canonical Flanking Sequence', 'Constitutive', 'MS_LIT', 'MS_CST', 'LT_LIT', 'Compendia', 'Number of Compendia']]

        #check if ptm is associated with the same gene (if info is provided). If not, do not add
        if gene is not None:
            for i, row in ptms_in_region.iterrows():
                #if ';' in row['UniProtKB Accession']:
                #    uni_ids = row['UniProtKB Accession'].split(';')
                #    remove = True
                #    for uni in uni_ids:
                #        if row['UniProtKB Accession'] in pose_config.uniprot_to_genename:
                #            if gene in pose_config.uniprot_to_genename[uni.split('-')[0]].split(' '):
                #                remove = False
                #                break
                #    if remove:
                #        ptms_in_region.drop(i)
                if row['UniProtKB Accession'] in pose_config.uniprot_to_genename:
                    if gene not in pose_config.uniprot_to_genename[row['UniProtKB Accession']].split(' '):
                        ptms_in_region = ptms_in_region.drop(i)
                else:
                    ptms_in_region = ptms_in_region.drop(i)

            #make sure ptms still are present after filtering
            if ptms_in_region.empty:
                return pd.DataFrame()
            else:
                ptms_in_region.insert(0, 'Gene', gene)

        #calculate proximity to region start and end
        ptms_in_region['Proximity to Region Start (bp)'] = (ptms_in_region[loc_col] - start).abs()
        ptms_in_region['Proximity to Region End (bp)'] = (ptms_in_region[loc_col] - end).abs()
        ptms_in_region['Proximity to Splice Boundary (bp)'] = ptms_in_region.apply(lambda x: min(x['Proximity to Region Start (bp)'], x['Proximity to Region End (bp)']), axis = 1)

        return ptms_in_region
    else:
        return pd.DataFrame()


def convert_strand_symbol(strand):
    """
    Given DNA strand information, make sure the strand information is in integer format (1 for forward, -1 for reverse). This is intended to convert from string format ('+' or '-') to integer format (1 or -1), but will return the input if it is already in integer format.

    Parameters
    ----------
    strand: str or int
        DNA strand information, either as a string ('+' or '-') or an integer (1 or -1)

    Returns
    -------
    int
        DNA strand information as an integer (1 for forward, -1 for reverse)
    """
    if isinstance(strand, str):
        if strand == '+' or strand == '1':
            return 1
        elif strand == '-' or strand == '-1':
            return -1
    else:
        return strand


def find_ptms_in_many_regions(region_data, ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'exonStart_0base', region_end_col = 'exonEnd', gene_col = None, dPSI_col = None, sig_col = None, event_id_col = None, extra_cols = None, annotate_original_df = True, coordinate_type = 'hg38', start_coordinate_system = '1-based', end_coordinate_system = '1-based', separate_modification_types = False, taskbar_label = None):
    """
    Given a dataframe with a unique region in each row, project PTMs onto the regions. Assumes that the region data will have chromosome, strand, and genomic start/end positions, and each row corresponds to a unique region.

    Parameters
    ----------
    region_data: pandas.DataFrame
        dataframe containing region information, including chromosome, strand, and genomic location of regions of interest
    ptm_coordinates: pandas.DataFrame
        dataframe containing PTM information, including chromosome, strand, and genomic location of PTMs
    chromosome_col: str
        column name in splice_data that contains chromosome information. Default is 'chr'. Expects it to be a str with only the chromosome number: 'Y', '1', '2', etc.
    strand_col: str
        column name in splice_data that contains strand information. Default is 'strand'. Expects it to be a str with '+' or '-', or integers as 1 or -1. Will convert to integers automatically if string format is provided.
    region_start_col: str
        column name in splice_data that contains the start position of the region of interest. Default is 'exonStart_0base'.
    region_end_col: str
        column name in splice_data that contains the end position of the region of interest. Default is 'exonEnd'.
    gene_col: str
        column name in splice_data that contains the gene name. If provided, will be used to make sure the projected PTMs stem from the same gene (some cases where genomic coordinates overlap between distinct genes). Default is None.
    event_id_col: str
        column name in splice_data that contains the unique identifier for the splice event. If provided, will be used to annotate the ptm information with the specific splice event ID. Default is None.
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg38'.
    separate_modification_types: bool
        Indicate whether to store PTM sites with multiple modification types as multiple rows. For example, if a site at K100 was both an acetylation and methylation site, these will be separated into unique rows with the same site number but different modification types. Default is True.
    taskbar_label: str
        Label to display in the tqdm progress bar. Default is None, which will automatically state "Projecting PTMs onto regions using ----- coordinates".

    Returns
    -------
    spliced_ptm_info: pandas.DataFrame
        Contains the PTMs identified across the different splice events
    splice_data: pandas.DataFrame
        dataframe containing the original splice data with an additional column 'PTMs' that contains the PTMs found in the region of interest, in the format of 'SiteNumber(ModificationType)'. If no PTMs are found, the value will be np.nan.
    """
    if taskbar_label is None:
        taskbar_label = 'Projecting PTMs onto regions using ' + coordinate_type + ' coordinates.'

    if region_data[chromosome_col].str.contains('chr').any():
        region_data[chromosome_col] = region_data[chromosome_col].str.strip('chr')

    spliced_ptm_info = []
    spliced_ptms_list = []
    num_ptms_affected = []
    num_unique_ptm_sites = []

    #copy
    region_data = region_data.copy()

    #iterate through each row of the splice data and find PTMs in the region
    for index, row in tqdm(region_data.iterrows(), total = len(region_data), desc = taskbar_label):
        #grab region information from row
        chromosome = row[chromosome_col]
        strand = convert_strand_symbol(row[strand_col])
        start = row[region_start_col]
        end = row[region_end_col]
        #only provide these if column is given
        gene = row[gene_col] if gene_col is not None else None

        #adjust region coordinates if needed (make sure in 1-based coordinate system)
        if start_coordinate_system == '0-based':
            start += 1
        elif start_coordinate_system != '1-based':
            raise ValueError("Start coordinate system must be either '0-based' or '1-based'")

        if end_coordinate_system == '0-based':
            end += 1
        elif end_coordinate_system != '1-based':
            raise ValueError("End coordinate system must be either '0-based' or '1-based'")

        #project ptms onto region
        ptms_in_region = find_ptms_in_region(ptm_coordinates, chromosome, strand, start, end, gene = gene, coordinate_type = coordinate_type)

        #add additional context from splice data, if indicated
        extra_info = {}
        if event_id_col is not None:
            extra_info['Region ID'] = row[event_id_col]
        if dPSI_col is not None:
            extra_info['dPSI'] = row[dPSI_col]
        if sig_col is not None:
            extra_info['Significance'] = row[sig_col]
        if extra_cols is not None:
            for col in extra_cols:
                extra_info[col] = row[col]

        #add extra info to ptms_in_region
        ptms_in_region = pd.concat([pd.DataFrame(extra_info, index = ptms_in_region.index), ptms_in_region], axis = 1)

        #if desired, add ptm information to the original splice event dataframe
        if annotate_original_df:
            if not ptms_in_region.empty:
                #split and separate unique modification types, if requested
                if separate_modification_types:
                    ptms_in_region['Modification Class'] = ptms_in_region['Modification Class'].str.split(';')
                    ptms_in_region = ptms_in_region.explode('Modification Class')

                ptms_info = ptms_in_region.apply(lambda x: x['UniProtKB Accession'] + '_' + x['Residue'] + str(x['PTM Position in Isoform']) + ' (' + x['Modification Class'] + ')', axis = 1)
                ptms_str = '/'.join(ptms_info.values)
                spliced_ptms_list.append(ptms_str)
                num_ptms_affected.append(ptms_in_region.shape[0])
                num_unique_ptm_sites.append(ptms_in_region.groupby(['UniProtKB Accession', 'Residue', 'PTM Position in Isoform']).size().shape[0])
            else:
                spliced_ptms_list.append(np.nan)
                num_ptms_affected.append(0)
                num_unique_ptm_sites.append(0)

        spliced_ptm_info.append(ptms_in_region.copy())

    #combine all PTM information
    spliced_ptm_info = pd.concat(spliced_ptm_info, ignore_index = True)

    #convert ptm position to float
    if spliced_ptm_info.shape[0] > 0:
        spliced_ptm_info['PTM Position in Isoform'] = spliced_ptm_info['PTM Position in Isoform'].astype(float)

    #add ptm info to original splice event dataframe
    if annotate_original_df:
        region_data['PTMs'] = spliced_ptms_list
        region_data['Number of PTMs Affected'] = num_ptms_affected
        region_data['Number of Unique PTM Sites by Position'] = num_unique_ptm_sites
        region_data['Event Length'] = (region_data[region_end_col] - region_data[region_start_col]).abs()
        region_data['PTM Density (PTMs/bp)'] = (region_data['Number of Unique PTM Sites by Position']*3)/region_data['Event Length']  #multiply by 3 to convert aa to bp (3 bp per codon)

    return region_data, spliced_ptm_info


def project_ptms_onto_splice_events(splice_data, annotate_original_df = True, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'exonStart_0base', region_end_col = 'exonEnd', dPSI_col = None, sig_col = None, event_id_col = None, gene_col = None, extra_cols = None, separate_modification_types = False, coordinate_type = 'hg38', start_coordinate_system = '1-based', end_coordinate_system = '1-based', taskbar_label = None, ptm_coordinates = None, PROCESSES = 1, **kwargs):
    """
    Given splice event quantification data, project PTMs onto the regions impacted by the splice events. Assumes that the splice event data will have chromosome, strand, and genomic start/end positions for the regions of interest, and each row of the splice_event_data corresponds to a unique region.

    Important note: PTM-POSE relies on Ensembl based coordinates (1-based), so if the coordinates are 0-based, make sure to indicate using the start_coordinate_system and end_coordinate_system parameters. For example, rMATS uses 0-based for the start coordinates, but 1-based for the end coordinates. In this case, set start_coordinate_system = '0-based' and end_coordinate_system = '1-based'.

    Parameters
    ----------
    splice_data: pandas.DataFrame
        dataframe containing splice event information, including chromosome, strand, and genomic location of regions of interest
    ptm_coordinates: pandas.DataFrame
        dataframe containing PTM information, including chromosome, strand, and genomic location of PTMs. If None, it will pull from the config file.
    chromosome_col: str
        column name in splice_data that contains chromosome information. Default is 'chr'. Expects it to be a str with only the chromosome number: 'Y', '1', '2', etc.
    strand_col: str
        column name in splice_data that contains strand information. Default is 'strand'. Expects it to be a str with '+' or '-', or integers as 1 or -1. Will convert to integers automatically if string format is provided.
    region_start_col: str
        column name in splice_data that contains the start position of the region of interest. Default is 'exonStart_0base'.
    region_end_col: str
        column name in splice_data that contains the end position of the region of interest. Default is 'exonEnd'.
    event_id_col: str
        column name in splice_data that contains the unique identifier for the splice event. If provided, will be used to annotate the ptm information with the specific splice event ID. Default is None.
    gene_col: str
        column name in splice_data that contains the gene name. If provided, will be used to make sure the projected PTMs stem from the same gene (some cases where genomic coordinates overlap between distinct genes). Default is None.
    dPSI_col: str
        column name in splice_data that contains the delta PSI value for the splice event. Default is None, which will not include this information in the output.
    sig_col: str
        column name in splice_data that contains the significance value for the splice event. Default is None, which will not include this information in the output.
    extra_cols: list
        list of additional columns to include in the output dataframe. Default is None, which will not include any additional columns.
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg38'.
    start_coordinate_system: str
        indicates the coordinate system used for the start position. Either '0-based' or '1-based'. Default is '1-based'.
    end_coordinate_system: str
        indicates the coordinate system used for the end position. Either '0-based' or '1-based'. Default is '1-based'.
    separate_modification_types: bool
        Indicate whether to store PTM sites with multiple modification types as multiple rows. For example, if a site at K100 was both an acetylation and methylation site, these will be separated into unique rows with the same site number but different modification types. Default is True.
    taskbar_label: str
        Label to display in the tqdm progress bar. Default is None, which will automatically state "Projecting PTMs onto regions using ----- coordinates".
    PROCESSES: int
        Number of processes to use for multiprocessing. Default is 1 (single processing).
    **kwargs: additional keyword arguments
        Additional keyword arguments to pass to the find_ptms_in_many_regions function, which will be fed into the filter_ptms() function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the filter_ptms() function for more options.

    Returns
    -------
    spliced_ptm_info: pandas.DataFrame
        Contains the PTMs identified across the different splice events
    splice_data: pandas.DataFrame
        dataframe containing the original splice data with an additional column 'PTMs' that contains the PTMs found in the region of interest, in the format of 'SiteNumber(ModificationType)'. If no PTMs are found, the value will be np.nan.
    """
    #load ptm data from config if not provided
    if ptm_coordinates is None:
        ptm_coordinates = pose_config.ptm_coordinates.copy()

    #check for any keyword arguments to use for filtering
    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        #check any excess unused keyword arguments, report them
        helpers.check_filter_kwargs(filter_arguments)
        #filter ptm coordinates file to include only ptms with desired evidence
        ptm_coordinates = helpers.filter_ptms(ptm_coordinates, **filter_arguments)

    if taskbar_label is None:
        taskbar_label = 'Projecting PTMs onto splice events using ' + coordinate_type + ' coordinates.'

    #copy
    splice_data = splice_data.copy()

    #check columns to make sure they are present and correct data type
    check_columns(splice_data, chromosome_col=chromosome_col, strand_col=strand_col, region_start_col=region_start_col, region_end_col=region_end_col, dPSI_col=dPSI_col, sig_col=sig_col, event_id_col=event_id_col, gene_col=gene_col, extra_cols=extra_cols)

    if PROCESSES == 1:
        splice_data, spliced_ptm_info = find_ptms_in_many_regions(splice_data, ptm_coordinates, chromosome_col = chromosome_col, strand_col = strand_col, region_start_col = region_start_col, region_end_col = region_end_col, dPSI_col = dPSI_col, sig_col = sig_col, event_id_col = event_id_col, gene_col = gene_col, extra_cols = extra_cols, annotate_original_df = annotate_original_df, coordinate_type = coordinate_type, start_coordinate_system = start_coordinate_system, end_coordinate_system = end_coordinate_system, taskbar_label = taskbar_label, separate_modification_types = separate_modification_types)
    elif PROCESSES > 1:
        #check num_cpus available; if greater than number of cores - 1 (to avoid freezing machine), set PROCESSES to 1 less than total number of cores
        num_cores = multiprocessing.cpu_count()
        if PROCESSES > num_cores - 1:
            PROCESSES = num_cores - 1

        #split dataframe into chunks equal to PROCESSES
        splice_data_split = np.array_split(splice_data, PROCESSES)
        pool = multiprocessing.Pool(PROCESSES)

        #run with multiprocessing; arguments are passed positionally, so tuple order must match the signature of find_ptms_in_many_regions
        results = pool.starmap(find_ptms_in_many_regions, [(splice_data_split[i], ptm_coordinates, chromosome_col, strand_col, region_start_col, region_end_col, gene_col, dPSI_col, sig_col, event_id_col, extra_cols, annotate_original_df, coordinate_type, start_coordinate_system, end_coordinate_system, separate_modification_types, taskbar_label) for i in range(PROCESSES)])

        splice_data = pd.concat([res[0] for res in results])
        spliced_ptm_info = pd.concat([res[1] for res in results])
        #raise ValueError('Multiprocessing not yet functional. Please set PROCESSES = 1.')

    print(f'PTMs projection successful ({spliced_ptm_info.shape[0]} identified).\n')
    return splice_data, spliced_ptm_info


def project_ptms_onto_MATS(SE_events = None, A5SS_events = None, A3SS_events = None, RI_events = None, MXE_events = None, coordinate_type = 'hg38', identify_flanking_sequences = False, dPSI_col = 'meanDeltaPSI', sig_col = 'FDR', extra_cols = None, separate_modification_types = False, PROCESSES = 1, ptm_coordinates = None, **kwargs):
    """
    Given splice quantification from the MATS algorithm, annotate with PTMs that are found in the differentially included regions.

    Parameters
    ----------
    ptm_coordinates: pandas.DataFrame
        dataframe containing PTM information, including chromosome, strand, and genomic location of PTMs
    SE_events: pandas.DataFrame
        dataframe containing skipped exon event information from MATS
    A5SS_events: pandas.DataFrame
        dataframe containing 5' alternative splice site event information from MATS
    A3SS_events: pandas.DataFrame
        dataframe containing 3' alternative splice site event information from MATS
    RI_events: pandas.DataFrame
        dataframe containing retained intron event information from MATS
    MXE_events: pandas.DataFrame
        dataframe containing mutually exclusive exon event information from MATS
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg38'.
    dPSI_col: str
        Column name indicating delta PSI value. Default is 'meanDeltaPSI'.
    sig_col: str
        Column name indicating significance of the event. Default is 'FDR'.
    extra_cols: list
        List of column names for additional information to add to the results. Default is None.
    separate_modification_types: bool
        Indicate whether residues with multiple modifications (i.e. phosphorylation and acetylation) should be treated as separate PTMs and be placed in unique rows of the output dataframe. Default is False.
    PROCESSES: int
        Number of processes to use for multiprocessing. Default is 1.
    **kwargs: additional keyword arguments
        Additional keyword arguments to pass to the find_ptms_in_many_regions function, which will be fed into the filter_ptms() function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the filter_ptms() function for more options.
    """
    #load ptm data from config if not provided
    if ptm_coordinates is None:
        ptm_coordinates = pose_config.ptm_coordinates.copy()

    #check for any keyword arguments to use for filtering
    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        #check any excess unused keyword arguments, report them
        helpers.check_filter_kwargs(filter_arguments)
        #filter ptm coordinates file to include only ptms with desired evidence
        ptm_coordinates = helpers.filter_ptms(ptm_coordinates, **filter_arguments)

    print(f'Projecting PTMs onto MATS splice events using {coordinate_type} coordinates.')

    spliced_events = {}
    spliced_flanks = []
    spliced_ptms = []

    if SE_events is not None:
        #reformat chromosome name format
        if SE_events['chr'].str.contains('chr').any():
            SE_events['chr'] = SE_events['chr'].apply(lambda x: x[3:])

        SE_events['AS ID'] = "SE_" + SE_events.index.astype(str)

        #check to make sure there is enough information to do multiprocessing if that is desired
        if PROCESSES*4 > SE_events.shape[0]:
            SE_processes = 1
        else:
            SE_processes = PROCESSES

        spliced_events['SE'], SE_ptms = project_ptms_onto_splice_events(SE_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'exonStart_0base', region_end_col = 'exonEnd', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based', taskbar_label = "Skipped Exon events", separate_modification_types=separate_modification_types, PROCESSES = SE_processes)
        SE_ptms['Event Type'] = 'SE'
        spliced_ptms.append(SE_ptms)

        if identify_flanking_sequences:
            print('Identifying flanking sequences for skipped exon events.')
            if 'upstreamES' in SE_events.columns:
                first_flank_start_col = 'upstreamES'
                first_flank_end_col = 'upstreamEE'
                second_flank_start_col = 'downstreamES'
                second_flank_end_col = 'downstreamEE'
            elif 'firstFlankingES' in SE_events.columns:
                first_flank_start_col = 'firstFlankingES'
                first_flank_end_col = 'firstFlankingEE'
                second_flank_start_col = 'secondFlankingES'
                second_flank_end_col = 'secondFlankingEE'
            else:
                raise ValueError('Could not find flanking sequence columns in skipped exon event data, based on what is typically outputted by MATS. Please check column names and provide the appropriate columns for the first and second flanking sequences')

            SE_flanks = fs.get_flanking_changes_from_splice_data(SE_events, ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', spliced_region_start_col = 'exonStart_0base', spliced_region_end_col = 'exonEnd', first_flank_start_col = first_flank_start_col, first_flank_end_col = first_flank_end_col, second_flank_start_col = second_flank_start_col, second_flank_end_col = second_flank_end_col, dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based')
            SE_flanks['Event Type'] = 'SE'
            spliced_flanks.append(SE_flanks)
    else:
        print('Skipped exon event data (SE_events) not provided, skipping')

    if A5SS_events is not None:
        if A5SS_events['chr'].str.contains('chr').any():
            A5SS_events['chr'] = A5SS_events['chr'].apply(lambda x: x[3:])

        #set the relevant start and end of the spliced-out region, which differ depending on the strand
        region_start = []
        region_end = []
        first_flank_start = []
        first_flank_end = []
        second_flank_end = []
        second_flank_start = []
        for i, row in A5SS_events.iterrows():
            strand = row['strand']
            if strand == '+':
                region_start.append(row['shortEE'])
                region_end.append(row['longExonEnd'])
                if identify_flanking_sequences:
                    first_flank_start.append(row['shortES'])
                    first_flank_end.append(row['shortEE'])
                    second_flank_start.append(row['flankingES'])
                    second_flank_end.append(row['flankingEE'])
            else:
                region_start.append(row['longExonStart_0base'])
                region_end.append(row['shortES'])
                if identify_flanking_sequences:
                    second_flank_start.append(row['shortES'])
                    second_flank_end.append(row['shortEE'])
                    first_flank_start.append(row['flankingES'])
                    first_flank_end.append(row['flankingEE'])

        A5SS_events['event_start'] = region_start
        A5SS_events['event_end'] = region_end
        if identify_flanking_sequences:
            A5SS_events['first_flank_start'] = first_flank_start
            A5SS_events['first_flank_end'] = first_flank_end
            A5SS_events['second_flank_start'] = second_flank_start
            A5SS_events['second_flank_end'] = second_flank_end

        #set specific as id
        A5SS_events['AS ID'] = "5ASS_" + A5SS_events.index.astype(str)

        #check to make sure there is enough information to do multiprocessing if that is desired
        if PROCESSES*4 > A5SS_events.shape[0]:
            fiveASS_processes = 1
        else:
            fiveASS_processes = PROCESSES

        #identify PTMs found within spliced regions
        spliced_events['5ASS'], fiveASS_ptms = project_ptms_onto_splice_events(A5SS_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'event_start', region_end_col = 'event_end', event_id_col = 'AS ID', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', coordinate_type=coordinate_type, start_coordinate_system = '0-based', extra_cols = extra_cols, taskbar_label = "5' ASS events", separate_modification_types=separate_modification_types, PROCESSES = fiveASS_processes)
        fiveASS_ptms['Event Type'] = '5ASS'
        spliced_ptms.append(fiveASS_ptms)

        #identify ptms with altered flanking sequences
        if identify_flanking_sequences:
            print("Identifying flanking sequences for 5'ASS events.")
            fiveASS_flanks = fs.get_flanking_changes_from_splice_data(A5SS_events, ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', spliced_region_start_col = 'event_start', spliced_region_end_col = 'event_end', first_flank_start_col = 'first_flank_start', first_flank_end_col = 'first_flank_end', second_flank_start_col = 'second_flank_start', second_flank_end_col = 'second_flank_end', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based')
            fiveASS_flanks['Event Type'] = '5ASS'
            spliced_flanks.append(fiveASS_flanks)
    else:
        print("5' ASS event data (A5SS_events) not provided, skipping.")

    if A3SS_events is not None:
        if A3SS_events['chr'].str.contains('chr').any():
            A3SS_events['chr'] = A3SS_events['chr'].apply(lambda x: x[3:])

        #set the relevant start and end of the spliced-out region, which differ depending on the strand
        region_start = []
        region_end = []
        first_flank_start = []
        first_flank_end = []
        second_flank_end = []
        second_flank_start = []
        for i, row in A3SS_events.iterrows():
            strand = row['strand']
            if strand == '+':
                region_start.append(row['longExonStart_0base'])
                region_end.append(row['shortES'])
                if identify_flanking_sequences:
                    second_flank_start.append(row['flankingES'])
                    second_flank_end.append(row['flankingEE'])
                    first_flank_start.append(row['shortES'])
                    first_flank_end.append(row['shortEE'])
            else:
                region_start.append(row['shortEE'])
                region_end.append(row['longExonEnd'])
                if identify_flanking_sequences:
                    second_flank_start.append(row['flankingES'])
                    second_flank_end.append(row['flankingEE'])
                    first_flank_start.append(row['shortES'])
                    first_flank_end.append(row['shortEE'])

        #save region info
        A3SS_events['event_start'] = region_start
        A3SS_events['event_end'] = region_end
        if identify_flanking_sequences:
            A3SS_events['first_flank_start'] = first_flank_start
            A3SS_events['first_flank_end'] = first_flank_end
            A3SS_events['second_flank_start'] = second_flank_start
            A3SS_events['second_flank_end'] = second_flank_end

        #add event ids
        A3SS_events['AS ID'] = "3ASS_" + A3SS_events.index.astype(str)

        #check to make sure there is enough information to do multiprocessing if that is desired
        if PROCESSES*4 > A3SS_events.shape[0]:
            threeASS_processes = 1
        else:
            threeASS_processes = PROCESSES

        spliced_events['3ASS'], threeASS_ptms = project_ptms_onto_splice_events(A3SS_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'event_start', region_end_col = 'event_end', event_id_col = 'AS ID', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system = '0-based', taskbar_label = "3' ASS events", separate_modification_types=separate_modification_types, PROCESSES = threeASS_processes)
        threeASS_ptms['Event Type'] = '3ASS'
        spliced_ptms.append(threeASS_ptms)

        #identify ptms with altered flanking sequences
        if identify_flanking_sequences:
            print("Identifying flanking sequences for 3' ASS events.")
            threeASS_flanks = fs.get_flanking_changes_from_splice_data(A3SS_events, ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', spliced_region_start_col = 'event_start', spliced_region_end_col = 'event_end', first_flank_start_col = 'first_flank_start', first_flank_end_col = 'first_flank_end', second_flank_start_col = 'second_flank_start', second_flank_end_col = 'second_flank_end', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based')
            threeASS_flanks['Event Type'] = '3ASS'
            spliced_flanks.append(threeASS_flanks)
    else:
        print("3' ASS event data (A3SS_events) not provided, skipping")

    if RI_events is not None:
        if RI_events['chr'].str.contains('chr').any():
            RI_events['chr'] = RI_events['chr'].apply(lambda x: x[3:])

        #add event id
        RI_events['AS ID'] = "RI_" + RI_events.index.astype(str)

        #check to make sure there is enough information to do multiprocessing if that is desired
        if PROCESSES*4 > RI_events.shape[0]:
            RI_processes = 1
        else:
            RI_processes = PROCESSES

        spliced_events['RI'], RI_ptms = project_ptms_onto_splice_events(RI_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = 'upstreamEE', region_end_col = 'downstreamES', event_id_col = 'AS ID', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', coordinate_type=coordinate_type, start_coordinate_system='0-based', extra_cols = extra_cols, taskbar_label = 'Retained Intron Events', separate_modification_types=separate_modification_types, PROCESSES = RI_processes)
        RI_ptms['Event Type'] = 'RI'
        spliced_ptms.append(RI_ptms)

        #identify ptms with altered flanking sequences
        if identify_flanking_sequences:
            print('Identifying flanking sequences for retained intron events.')
            RI_flanks = fs.get_flanking_changes_from_splice_data(RI_events, ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', spliced_region_start_col = 'upstreamEE', spliced_region_end_col = 'downstreamES', first_flank_start_col = 'upstreamES', first_flank_end_col = 'upstreamEE', second_flank_start_col = 'downstreamES', second_flank_end_col = 'downstreamEE', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', event_id_col = 'AS ID', extra_cols = extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based')
            RI_flanks['Event Type'] = 'RI'
            spliced_flanks.append(RI_flanks)

    if MXE_events is not None:
        if MXE_events['chr'].str.contains('chr').any():
            MXE_events['chr'] = MXE_events['chr'].apply(lambda x: x[3:])

        #check to make sure there is enough information to do multiprocessing if that is desired
        if PROCESSES*4 > MXE_events.shape[0]:
            MXE_processes = 1
        else:
            MXE_processes = PROCESSES

        #add AS ID
        MXE_events['AS ID'] = "MXE_" + MXE_events.index.astype(str)

        mxe_ptms = []
        #first mxe exon
        spliced_events['MXE_Exon1'], MXE_Exon1_ptms = project_ptms_onto_splice_events(MXE_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = '1stExonStart_0base', region_end_col = '1stExonEnd', event_id_col = 'AS ID', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', coordinate_type=coordinate_type, start_coordinate_system = '0-based', taskbar_label = 'MXE, First Exon', extra_cols=extra_cols, separate_modification_types=separate_modification_types, PROCESSES = MXE_processes)
        MXE_Exon1_ptms['Event Type'] = 'MXE (First Exon)'
        mxe_ptms.append(MXE_Exon1_ptms)

        #second mxe exon
        spliced_events['MXE_Exon2'], MXE_Exon2_ptms = project_ptms_onto_splice_events(MXE_events, annotate_original_df=True, ptm_coordinates = ptm_coordinates, chromosome_col = 'chr', strand_col = 'strand', region_start_col = '2ndExonStart_0base', region_end_col = '2ndExonEnd', event_id_col = 'AS ID', dPSI_col=dPSI_col, sig_col = sig_col, gene_col = 'geneSymbol', extra_cols=extra_cols, coordinate_type=coordinate_type, start_coordinate_system='0-based', taskbar_label = 'MXE, Second Exon', separate_modification_types=separate_modification_types, PROCESSES = MXE_processes)
        MXE_Exon2_ptms['Event Type'] = 'MXE (Second Exon)'
        mxe_ptms.append(MXE_Exon2_ptms)

        #combine mxe ptms, and then drop any PTMs that were found in both MXE's
        mxe_ptms = pd.concat([MXE_Exon1_ptms, MXE_Exon2_ptms])
        columns_to_check = ['UniProtKB Accession', 'Source of PTM', 'Residue', 'PTM Position in Isoform', 'Modification', 'Modification Class', 'Gene']
        if dPSI_col is not None:
            columns_to_check.append('dPSI')
        if sig_col is not None:
            columns_to_check.append('Significance')
        if extra_cols is not None:
            columns_to_check += extra_cols
        mxe_ptms = mxe_ptms.drop_duplicates(subset = columns_to_check, keep = False)

        #flip dPSI values for second exon
        if dPSI_col is not None:
            mxe_ptms['dPSI'] = mxe_ptms.apply(lambda x: x['dPSI'] * -1 if x['Event Type'] == 'MXE (Second Exon)' else x['dPSI'], axis = 1)

        #add mxe ptms to spliced_ptms
        spliced_ptms.append(mxe_ptms)

    spliced_ptms = pd.concat(spliced_ptms)
    if identify_flanking_sequences:
        spliced_flanks = pd.concat(spliced_flanks)
        return spliced_events, spliced_ptms, spliced_flanks
    else:
        return spliced_events, spliced_ptms


#def project_ptms_onto_MAJIQ_dPSI(majiq_data, ptm_coordinates = None, coordinate_type = 'hg38', identify_flanking_sequences = False, dPSI_col = 'dPSI', sig_col = 'FDR', separate_modification_types = False, PROCESSES = 1):
#    print('in progress')
#    pass


def add_splicegraph_info(psi_data, splicegraph, purpose = 'inclusion'):
    psi_data = psi_data[psi_data['splice_type'] != 'ME'].copy()
    if purpose == 'inclusion':
        #split exons into individual exons
        psi_data['Individual exon'] = psi_data['exons'].apply(lambda x: x.split(':'))
        psi_data = psi_data.explode('Individual exon').drop_duplicates()
        psi_data['Individual exon'] = psi_data['Individual exon'].astype(float)

        #add gene location information to psi data from spliceseq
        psi_data = psi_data.merge(splicegraph, left_on = ['symbol', 'Individual exon'], right_on = ['Symbol', 'Exon'], how = 'left')
        psi_data = psi_data.rename(columns = {'Chr_Start': 'spliced_region_start', 'Chr_Stop': 'spliced_region_end'})
        return psi_data
    elif purpose == 'flanking':
        print('Not yet active. Please check back later.')
    else:
        raise ValueError('Purpose must be either inclusion or flanking. Please provide the correct purpose for the splicegraph information.')


def project_ptms_onto_SpliceSeq(psi_data, splicegraph, gene_col = 'symbol', dPSI_col = None, sig_col = None, extra_cols = None, coordinate_type = 'hg19', separate_modification_types = False, identify_flanking_sequences = False, flank_size = 5, ptm_coordinates = None, PROCESSES = 1, **kwargs):
    """
    Given splice event quantification from SpliceSeq (such as what can be downloaded from TCGASpliceSeq), annotate with PTMs that are found in the differentially included regions.

    Parameters
    ----------
    psi_data: pandas.DataFrame
        dataframe containing splice event quantification from SpliceSeq. Must contain the following columns: 'symbol', 'exons', 'splice_type'.
    splicegraph: pandas.DataFrame
        dataframe containing exon information from the splicegraph used during splice event quantification. Must contain the following columns: 'Symbol', 'Exon', 'Chr_Start', 'Chr_Stop'.
    gene_col: str
        column name in psi_data that contains the gene name. Default is 'symbol'.
    dPSI_col: str
        column name in psi_data that contains the delta PSI value for the splice event. Default is None, which will not include this information in the output.
    sig_col: str
        column name in psi_data that contains the significance value for the splice event. Default is None, which will not include this information in the output.
    extra_cols: list
        list of additional columns to include in the output dataframe. Default is None, which will not include any additional columns.
    coordinate_type: str
        indicates the coordinate system used for the start and end positions. Either hg38 or hg19. Default is 'hg19'.
    separate_modification_types: bool
        Indicate whether to store PTM sites with multiple modification types as multiple rows. For example, if a site at K100 was both an acetylation and methylation site, these will be separated into unique rows with the same site number but different modification types. Default is True.
    identify_flanking_sequences: bool
        Indicate whether to identify and return the flanking sequences for the splice events. Default is False.
    flank_size: int
        Size of the flanking sequence to extract from the splice event. Default is 5, which will extract 5 bases upstream and downstream of the splice event. Only relevant if identify_flanking_sequences is True.
    PROCESSES: int
        Number of processes to use for multiprocessing. Default is 1 (single processing).
    **kwargs: additional keyword arguments
        Additional keyword arguments to pass to the find_ptms_in_many_regions function, which will be fed into the filter_ptms() function from the helper module. These will be used to filter ptms with lower evidence. For example, if you want to filter PTMs based on the number of MS observations, you can add 'min_MS_observations = 2' to the kwargs. This will filter out any PTMs that have less than 2 MS observations. See the filter_ptms() function for more options.
    """
    #load ptm data from config if not provided
    if ptm_coordinates is None:
        ptm_coordinates = pose_config.ptm_coordinates.copy()

    #check for any keyword arguments to use for filtering
    if kwargs:
        filter_arguments = helpers.extract_filter_kwargs(**kwargs)
        #check any excess unused keyword arguments, report them
        helpers.check_filter_kwargs(filter_arguments)
        #filter ptm coordinates file to include only ptms with desired evidence
        ptm_coordinates = helpers.filter_ptms(ptm_coordinates, **filter_arguments)

    #drop columns that will be re-added from the splicegraph
    overlapping_columns = set(psi_data.columns).intersection({'Chromosome', 'Strand', 'Chr_Start', 'Chr_Stop'})
    if len(overlapping_columns) > 0:
        psi_data = psi_data.drop(columns=overlapping_columns)

    #remove ME events from this analysis
    print('Removing ME events from analysis')
    spliced_data = psi_data.copy()
    spliced_data = spliced_data[spliced_data['splice_type'] != 'ME'].copy()

    #split exons into individual exons
    spliced_data['Individual exon'] = spliced_data['exons'].apply(lambda x: x.split(':'))
    spliced_data = spliced_data.explode('Individual exon').drop_duplicates()
    spliced_data['Individual exon'] = spliced_data['Individual exon'].astype(float)

    #add gene location information to psi data from spliceseq
    spliced_data = spliced_data.merge(splicegraph.copy(), left_on = ['symbol', 'Individual exon'], right_on = ['Symbol', 'Exon'], how = 'left')
    spliced_data = spliced_data.rename(columns = {'Chr_Start': 'spliced_region_start', 'Chr_Stop': 'spliced_region_end'})

    print('Projecting PTMs onto SpliceSeq data')
    spliced_data, spliced_ptms = project_ptms_onto_splice_events(spliced_data, chromosome_col = 'Chromosome', strand_col = 'Strand', gene_col = 'symbol', region_start_col = 'spliced_region_start', region_end_col = 'spliced_region_end', event_id_col = 'as_id', dPSI_col = dPSI_col, sig_col = sig_col, extra_cols = extra_cols, separate_modification_types = separate_modification_types, coordinate_type = coordinate_type, PROCESSES = PROCESSES)

    ## add code for extracting flanking sequences (to do)
    if identify_flanking_sequences:
        altered_flanks = fs.get_flanking_changes_from_splicegraph(psi_data, splicegraph, dPSI_col = dPSI_col, sig_col = sig_col, extra_cols = extra_cols, gene_col = gene_col, coordinate_type=coordinate_type, flank_size = flank_size)
        return spliced_data, spliced_ptms, altered_flanks
    else:
        return spliced_data, spliced_ptms


#def project_ptms_onto_TCGA_SpliceSeq(tcga_cancer = 'PRAD'):
#    """
#    In progress. Will download and process TCGA SpliceSeq data for a specific cancer type, and project PTMs onto the spliced regions.
#    """
#    print('Not yet active. Please check back later.')
#    pass


def check_columns(splice_data, chromosome_col = None, strand_col = None, region_start_col = None, region_end_col = None, first_flank_start_col = None, first_flank_end_col = None, second_flank_start_col = None, second_flank_end_col = None, gene_col = None, dPSI_col = None, sig_col = None, event_id_col = None, extra_cols = None):
    """
    Function to quickly check if the provided column names exist in the dataset and if they are the correct type of data
    """
    expected_cols = [chromosome_col, strand_col, region_start_col, region_end_col, first_flank_start_col, first_flank_end_col, second_flank_start_col, second_flank_end_col, gene_col, dPSI_col, sig_col, event_id_col]
    expected_dtypes = [[str, object], [str, int, object], [int, float], [int, float], [int, float], [int, float], [int, float], [int, float], [str, object], float, float, None]

    #remove cols with None and the corresponding dtype entry
    expected_dtypes = [dtype for col, dtype in zip(expected_cols, expected_dtypes) if col is not None]
    expected_cols = [col for col in expected_cols if col is not None]

    #add extra columns to the expected columns list
    if extra_cols is not None:
        expected_cols += extra_cols
        expected_dtypes += [None]*len(extra_cols)  #extra columns do not have dtype requirement

    #check to make sure columns exist in the dataframe
    if not all([x in splice_data.columns for x in expected_cols]):
        raise ValueError('Not all expected columns are present in the splice data. Please check the column names and provide the correct names for the following columns: {}'.format([x for x in expected_cols if x not in splice_data.columns]))

    #check to make sure columns are the correct data type
    for col, data_type in zip(expected_cols, expected_dtypes):
        if data_type is None:
            continue
        elif isinstance(data_type, list):
            if splice_data[col].dtype not in data_type:
                #try converting to the expected data type
                try:
                    splice_data[col] = splice_data[col].astype(data_type[0])
                except:
                    raise ValueError('Column {} is not the expected data type. Expected data type is one of {}, but found data type {}'.format(col, data_type, splice_data[col].dtype))
        else:
            if splice_data[col].dtype != data_type:
                #try converting to the expected data type
                try:
                    splice_data[col] = splice_data[col].astype(data_type)
                except:
                    raise ValueError('Column {} is not the expected data type. Expected data type is {}, but found data type {}'.format(col, data_type, splice_data[col].dtype))
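The start_coordinate_system handling above is the part most worth sanity-checking when wiring up rMATS output: rMATS start columns such as exonStart_0base are 0-based while PTM-POSE works in 1-based Ensembl coordinates, so the code shifts starts by +1 and leaves the (already 1-based) ends alone. A small worked check, with made-up coordinates:

# rMATS-style row: 0-based half-open start, 1-based inclusive end (values illustrative)
exonStart_0base, exonEnd = 99, 200

# PTM-POSE-style conversion to a fully 1-based inclusive interval
start = exonStart_0base + 1   # 100
end = exonEnd                 # 200

# Both conventions describe the same 101 bp of sequence
length_half_open = exonEnd - exonStart_0base   # 101
length_inclusive = end - start + 1             # 101
assert length_half_open == length_inclusive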
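One general fragility worth flagging in the multiprocessing branch: pool.starmap passes each tuple element positionally, so the tuple order must match the signature of find_ptms_in_many_regions exactly. If the signature gains, loses, or reorders a parameter (for instance an annotate_original_df flag changing position), a DataFrame can silently land in a boolean parameter, which is consistent with the traceback shown earlier. Binding everything except the varying chunk as keywords removes the ordering dependency entirely; a minimal sketch using functools.partial (worker and data names here are placeholders, not PTM-POSE APIs):

import multiprocessing
from functools import partial

def worker(chunk, ptm_coordinates, *, annotate_original_df=True, coordinate_type="hg38"):
    # keyword-only flags cannot be filled by a stray positional argument
    return len(chunk), annotate_original_df

if __name__ == "__main__":
    chunks = [[1, 2], [3, 4, 5]]
    ptms = {"dummy": True}
    # bind every fixed argument by name; only the chunk varies per task
    bound = partial(worker, ptm_coordinates=ptms, annotate_original_df=True, coordinate_type="hg38")
    with multiprocessing.Pool(2) as pool:
        print(pool.map(bound, chunks))   # [(2, True), (3, True)]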
07-09
检查代码中的错误,是否合理,是否冗余:import os import sys import re import json import gc import time import concurrent.futures import traceback import numpy as np import librosa import torch import psutil from typing import List, Dict, Tuple, Optional from threading import RLock, Semaphore from pydub import AudioSegment from pydub.silence import split_on_silence from pydub.utils import get_encoder_name_extension, make_chunks from modelscope.pipelines import pipeline from modelscope.utils.constant import Tasks from transformers import AutoModelForSequenceClassification, AutoTokenizer from torch.utils.data import TensorDataset, DataLoader from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLabel, QLineEdit, QTextEdit, QFileDialog, QProgressBar, QGroupBox, QMessageBox, QListWidget, QSplitter, QTabWidget, QTableWidget, QTableWidgetItem, QHeaderView, QAction, QMenu, QToolBar, QComboBox, QSpinBox, QDialog, QDialogButtonBox) from PyQt5.QtCore import QThread, pyqtSignal, Qt from PyQt5.QtGui import QFont, QColor, QIcon # ====================== 工具函数 ====================== def check_ffmpeg_available() -> Tuple[bool, str]: """检查ffmpeg是否可用并返回检查结果和说明""" try: # 尝试加载一个空的音频片段来触发ffmpeg检查 test_audio = AudioSegment.empty() # 尝试导出到一个常见格式 test_format = 'wav' encoder = get_encoder_name_extension(test_format) if not encoder: return False, f"未找到{test_format}格式的编码器,请确保ffmpeg已正确安装" return True, "ffmpeg已正确安装并可用" except FileNotFoundError: return False, "未找到ffmpeg程序,请安装ffmpeg并确保其在系统PATH中" except Exception as e: return False, f"ffmpeg检查失败: {str(e)}" # ====================== 资源监控器 ====================== class ResourceMonitor: def __init__(self): self.gpu_available = torch.cuda.is_available() def memory_percent(self) -> Dict[str, float]: try: result = {"cpu": psutil.virtual_memory().percent} if self.gpu_available: allocated = torch.cuda.memory_allocated() / (1024 ** 3) total = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) result["gpu"] = (allocated / total) * 100 if total > 0 else 0 return result except Exception as e: print(f"内存监控失败: {str(e)}") return {"cpu": 0, "gpu": 0} # ====================== 方言处理器(简化版) ====================== class DialectProcessor: # 合并贵州方言和普通话关键词 KEYWORDS = { "opening": ["您好", "很高兴为您服务", "请问有什么可以帮您", "麻烦您喽", "请问搞哪样", "有咋个可以帮您", "多谢喽"], "closing": ["感谢来电", "祝您生活愉快", "再见", "搞归一喽", "麻烦您喽", "再见喽", "慢走喽"], "forbidden": ["不知道", "没办法", "你投诉吧", "随便你", "搞不成", "没得法", "随便你喽", "你投诉吧喽"], "salutation": ["先生", "女士", "小姐", "老师", "师傅", "哥", "姐", "兄弟", "妹儿"], "reassurance": ["非常抱歉", "请不要着急", "我们会尽快处理", "理解您的心情", "实在对不住", "莫急哈", "马上帮您整", "理解您得很"] } # 贵州方言到普通话的固定映射 DIALECT_MAPPING = { "恼火得很": "非常生气", "鬼火戳": "很愤怒", "搞不成": "无法完成", "没得": "没有", "搞哪样嘛": "做什么呢", "归一喽": "完成了", "咋个": "怎么", "克哪点": "去哪里", "麻烦您喽": "麻烦您了", "多谢喽": "多谢了", "憨包": "傻瓜", "归一": "结束", "板扎": "很好", "鬼火冒": "非常生气", "背时": "倒霉", "吃豁皮": "占便宜" } # Trie树根节点 _trie_root = None class TrieNode: def __init__(self): self.children = {} self.is_end = False self.value = "" @classmethod def build_dialect_trie(cls): """构建方言转换的Trie树""" if cls._trie_root is not None: return cls._trie_root root = cls.TrieNode() # 按长度降序排序,确保最长匹配优先 for dialect, standard in sorted(cls.DIALECT_MAPPING.items(), key=lambda x: len(x[0]), reverse=True): node = root for char in dialect: if char not in node.children: node.children[char] = cls.TrieNode() node = node.children[char] node.is_end = True node.value = standard cls._trie_root = root return root @classmethod def preprocess_text(cls, texts: List[str]) -> List[str]: """使用Trie树进行方言转换""" if 
cls._trie_root is None: cls.build_dialect_trie() processed_texts = [] for text in texts: processed = [] i = 0 n = len(text) while i < n: node = cls._trie_root j = i found = False # 在Trie树中查找最长匹配 while j < n and text[j] in node.children: node = node.children[text[j]] j += 1 if node.is_end: # 找到完整匹配 processed.append(node.value) i = j found = True break if not found: # 无匹配 processed.append(text[i]) i += 1 processed_texts.append(''.join(processed)) return processed_texts # ====================== 系统配置管理器 ====================== class ConfigManager: _instance = None def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._init_config() return cls._instance def _init_config(self): self.config = { "model_paths": { "asr": "./models/iic-speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn", "sentiment": "./models/IDEA-CCNL-Erlangshen-Roberta-110M-Sentiment" }, "sample_rate": 16000, "silence_thresh": -40, "min_silence_len": 1000, "max_concurrent": 1, "max_audio_duration": 3600 # 移除了方言配置 } self.load_config() def load_config(self): try: if os.path.exists("config.json"): with open("config.json", "r", encoding="utf-8") as f: self.config.update(json.load(f)) except json.JSONDecodeError: print("配置文件格式错误,使用默认配置") except Exception as e: print(f"加载配置失败: {str(e)},使用默认配置") def save_config(self): try: with open("config.json", "w", encoding="utf-8") as f: json.dump(self.config, f, indent=2, ensure_ascii=False) except Exception as e: print(f"保存配置失败: {str(e)}") def get(self, key: str, default=None): return self.config.get(key, default) def set(self, key: str, value): self.config[key] = value self.save_config() def check_model_paths(self) -> Tuple[bool, List[str]]: """检查模型路径是否有效""" errors = [] model_paths = self.get("model_paths", {}) for model_name, path in model_paths.items(): if not path: errors.append(f"{model_name}模型路径未设置") elif not os.path.exists(path): errors.append(f"{model_name}模型路径不存在: {path}") elif not os.path.isdir(path): errors.append(f"{model_name}模型路径不是有效的目录: {path}") return len(errors) == 0, errors # ====================== 音频处理工具 ====================== class AudioProcessor: SUPPORTED_FORMATS = ('.mp3', '.wav', '.amr', '.m4a') @staticmethod def check_dependencies(): """检查音频处理所需的依赖""" return check_ffmpeg_available() @staticmethod def convert_to_wav(input_path: str, temp_dir: str) -> Optional[List[str]]: # 先检查ffmpeg是否可用 ffmpeg_available, ffmpeg_msg = check_ffmpeg_available() if not ffmpeg_available: print(f"ffmpeg错误: {ffmpeg_msg}") return None try: os.makedirs(temp_dir, exist_ok=True) ext = os.path.splitext(input_path)[1].lower() if ext not in AudioProcessor.SUPPORTED_FORMATS: raise ValueError(f"不支持的音频格式: {ext},支持的格式为: {', '.join(AudioProcessor.SUPPORTED_FORMATS)}") if ext == '.wav': return [input_path] # 尝试加载音频文件 try: audio = AudioSegment.from_file(input_path) except Exception as e: raise RuntimeError(f"无法加载音频文件: {str(e)}。请确认文件未损坏且ffmpeg支持该格式。") max_duration = ConfigManager().get("max_audio_duration", 3600) * 1000 if len(audio) > max_duration: return AudioProcessor._split_long_audio(audio, input_path, temp_dir) return AudioProcessor._convert_single_audio(audio, input_path, temp_dir) except Exception as e: print(f"格式转换失败: {str(e)}") return None @staticmethod def _split_long_audio(audio: AudioSegment, input_path: str, temp_dir: str) -> List[str]: chunks = split_on_silence( audio, min_silence_len=ConfigManager().get("min_silence_len", 1000), silence_thresh=ConfigManager().get("silence_thresh", -40), keep_silence=500 ) merged_chunks = [] current_chunk = 

# ====================== Audio processing utilities ======================
class AudioProcessor:
    SUPPORTED_FORMATS = ('.mp3', '.wav', '.amr', '.m4a')

    @staticmethod
    def check_dependencies():
        """Check the dependencies required for audio processing."""
        return check_ffmpeg_available()

    @staticmethod
    def convert_to_wav(input_path: str, temp_dir: str) -> Optional[List[str]]:
        # Verify ffmpeg first
        ffmpeg_available, ffmpeg_msg = check_ffmpeg_available()
        if not ffmpeg_available:
            print(f"ffmpeg错误: {ffmpeg_msg}")
            return None
        try:
            os.makedirs(temp_dir, exist_ok=True)
            ext = os.path.splitext(input_path)[1].lower()
            if ext not in AudioProcessor.SUPPORTED_FORMATS:
                raise ValueError(f"不支持的音频格式: {ext},支持的格式为: {', '.join(AudioProcessor.SUPPORTED_FORMATS)}")
            if ext == '.wav':
                return [input_path]
            try:
                audio = AudioSegment.from_file(input_path)
            except Exception as e:
                raise RuntimeError(f"无法加载音频文件: {str(e)}。请确认文件未损坏且ffmpeg支持该格式。")
            max_duration = ConfigManager().get("max_audio_duration", 3600) * 1000
            if len(audio) > max_duration:
                return AudioProcessor._split_long_audio(audio, input_path, temp_dir)
            return AudioProcessor._convert_single_audio(audio, input_path, temp_dir)
        except Exception as e:
            print(f"格式转换失败: {str(e)}")
            return None

    @staticmethod
    def _split_long_audio(audio: AudioSegment, input_path: str, temp_dir: str) -> List[str]:
        chunks = split_on_silence(
            audio,
            min_silence_len=ConfigManager().get("min_silence_len", 1000),
            silence_thresh=ConfigManager().get("silence_thresh", -40),
            keep_silence=500
        )
        # If no silence is detected the original code returned an empty list,
        # which the caller treated as a conversion failure; fall back to the
        # whole recording instead.
        if not chunks:
            chunks = [audio]
        merged_chunks = []
        current_chunk = AudioSegment.empty()
        for chunk in chunks:
            if len(current_chunk) + len(chunk) < 5 * 60 * 1000:  # 5 minutes
                current_chunk += chunk
            else:
                if len(current_chunk) > 0:
                    merged_chunks.append(current_chunk)
                current_chunk = chunk
        if len(current_chunk) > 0:
            merged_chunks.append(current_chunk)
        wav_paths = []
        sample_rate = ConfigManager().get("sample_rate", 16000)
        for i, chunk in enumerate(merged_chunks):
            chunk = chunk.set_frame_rate(sample_rate).set_channels(1)
            chunk_path = os.path.join(temp_dir, f"{os.path.splitext(os.path.basename(input_path))[0]}_part{i + 1}.wav")
            chunk.export(chunk_path, format="wav")
            wav_paths.append(chunk_path)
        return wav_paths

    @staticmethod
    def _convert_single_audio(audio: AudioSegment, input_path: str, temp_dir: str) -> List[str]:
        sample_rate = ConfigManager().get("sample_rate", 16000)
        audio = audio.set_frame_rate(sample_rate).set_channels(1)
        wav_path = os.path.join(temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".wav")
        audio.export(wav_path, format="wav")
        return [wav_path]

    @staticmethod
    def extract_features_from_audio(y: np.ndarray, sr: int) -> Dict[str, float]:
        try:
            duration = librosa.get_duration(y=y, sr=sr)
            segment_length = 60
            total_segments = max(1, int(np.ceil(duration / segment_length)))
            syllable_rates, volume_stabilities = [], []
            total_samples = len(y)
            samples_per_segment = int(segment_length * sr)
            for i in range(total_segments):
                start = i * samples_per_segment
                end = min((i + 1) * samples_per_segment, total_samples)
                y_segment = y[start:end]
                if len(y_segment) == 0:
                    continue
                intervals = librosa.effects.split(y_segment, top_db=20)
                # Renamed to avoid shadowing the enclosing start/end variables
                speech_samples = sum(e - s for s, e in intervals)
                speech_duration = speech_samples / sr
                syllable_rates.append(len(intervals) / speech_duration if speech_duration > 0.1 else 0)
                rms = librosa.feature.rms(y=y_segment, frame_length=2048, hop_length=512)[0]
                if len(rms) > 0 and np.mean(rms) > 0:
                    volume_stabilities.append(np.std(rms) / np.mean(rms))
            # Guard against np.mean([]) when every segment rate is zero
            positive_rates = [r for r in syllable_rates if r > 0]
            return {
                "duration": duration,
                "syllable_rate": round(float(np.mean(positive_rates)) if positive_rates else 0, 2),
                "volume_stability": round(float(np.mean(volume_stabilities)) if volume_stabilities else 0, 4)
            }
        except Exception as e:
            print(f"特征提取错误: {str(e)}")
            return {"duration": 0, "syllable_rate": 0, "volume_stability": 0}
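
# Typical use (illustrative; "call.mp3" is a hypothetical file):
#
#   wav_paths = AudioProcessor.convert_to_wav("call.mp3", "temp_wav")
#   y, sr = librosa.load(wav_paths[0], sr=16000)
#   AudioProcessor.extract_features_from_audio(y, sr)
#   # -> {"duration": ..., "syllable_rate": ..., "volume_stability": ...}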

# ====================== Model loader ======================
class ModelLoader:
    asr_pipeline = None
    sentiment_model = None
    sentiment_tokenizer = None
    model_lock = RLock()
    models_loaded = False

    @classmethod
    def load_models(cls):
        config = ConfigManager()
        # Validate model paths first
        paths_valid, errors = config.check_model_paths()
        if not paths_valid:
            raise ValueError(f"模型路径无效:\n{chr(10).join(errors)}")
        # Double-checked locking so concurrent callers load each model once
        if not cls.asr_pipeline:
            with cls.model_lock:
                if not cls.asr_pipeline:
                    cls._load_asr_model(config.get("model_paths")["asr"])
        if not cls.sentiment_model:
            with cls.model_lock:
                if not cls.sentiment_model:
                    cls._load_sentiment_model(config.get("model_paths")["sentiment"])
        cls.models_loaded = True

    @classmethod
    def reload_models(cls):
        with cls.model_lock:
            cls.asr_pipeline = None
            cls.sentiment_model = None
            cls.sentiment_tokenizer = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            cls.load_models()

    @classmethod
    def _load_asr_model(cls, model_path: str):
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"ASR模型路径不存在: {model_path}")
            # hasattr(torch, 'quantization') is effectively always true on
            # modern torch, so this is not a real capability check; whether
            # the pipeline honours 'quantize' depends on the model wrapper.
            asr_kwargs = {'quantize': 'int8'} if hasattr(torch, 'quantization') else {}
            cls.asr_pipeline = pipeline(
                task=Tasks.auto_speech_recognition,
                model=model_path,
                device='cuda' if torch.cuda.is_available() else 'cpu',
                **asr_kwargs
            )
        except Exception as e:
            print(f"加载ASR模型失败: {str(e)}")
            raise

    @classmethod
    def _load_sentiment_model(cls, model_path: str):
        try:
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"情感分析模型路径不存在: {model_path}")
            cls.sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_path)
            cls.sentiment_tokenizer = AutoTokenizer.from_pretrained(model_path)
            if torch.cuda.is_available():
                cls.sentiment_model = cls.sentiment_model.cuda()
            cls.sentiment_model.eval()  # inference only; disables dropout
        except Exception as e:
            print(f"加载情感分析模型失败: {str(e)}")
            raise

# ====================== Core analysis thread (simplified) ======================
class AnalysisThread(QThread):
    progress_updated = pyqtSignal(int, str, str)
    result_ready = pyqtSignal(dict)
    finished_all = pyqtSignal()
    error_occurred = pyqtSignal(str, str)
    memory_warning = pyqtSignal()
    resource_cleanup = pyqtSignal()

    def __init__(self, audio_paths: List[str], temp_dir: str = "temp_wav"):
        super().__init__()
        self.audio_paths = audio_paths
        self.temp_dir = temp_dir
        self.is_running = True
        self.current_file = ""
        self.max_concurrent = min(ConfigManager().get("max_concurrent", 1), self._get_max_concurrent_tasks())
        self.resource_monitor = ResourceMonitor()
        self.semaphore = Semaphore(self.max_concurrent)
        os.makedirs(temp_dir, exist_ok=True)

    def run(self):
        try:
            ffmpeg_available, ffmpeg_msg = check_ffmpeg_available()
            if not ffmpeg_available:
                self.error_occurred.emit("音频处理依赖缺失", f"无法处理音频: {ffmpeg_msg}\n\n请安装ffmpeg并确保其在系统PATH中。\nWindows用户可从https://ffmpeg.org/download.html下载并添加到环境变量。")
                return
            if not ModelLoader.models_loaded:
                self.error_occurred.emit("模型未加载", "请等待模型加载完成后再开始分析")
                return
            self.progress_updated.emit(0, f"最大并行任务数: {self.max_concurrent}", "")
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_concurrent) as executor:
                future_to_path = {}
                for path in self.audio_paths:
                    if not self.is_running:
                        break
                    self.semaphore.acquire()
                    future = executor.submit(self.analyze_audio, path, self._get_available_batch_size())
                    future_to_path[future] = path
                    future.add_done_callback(lambda f: self.semaphore.release())
                for i, future in enumerate(concurrent.futures.as_completed(future_to_path)):
                    if not self.is_running:
                        break
                    path = future_to_path[future]
                    self.current_file = os.path.basename(path)
                    if self._check_memory_usage():
                        self.memory_warning.emit()
                        self.is_running = False
                        break
                    try:
                        result = future.result()
                        if result:
                            self.result_ready.emit(result)
                        progress = int((i + 1) / len(self.audio_paths) * 100)
                        self.progress_updated.emit(progress, f"完成: {self.current_file} ({i + 1}/{len(self.audio_paths)})", self.current_file)
                    except Exception as e:
                        result = {"file_name": self.current_file, "status": "error", "error": f"分析失败: {str(e)}"}
                        self.result_ready.emit(result)
            if self.is_running:
                self.finished_all.emit()
        except Exception as e:
            self.error_occurred.emit("系统错误", str(e))
            traceback.print_exc()
        finally:
            self.resource_cleanup.emit()
            self._cleanup_resources()

    def analyze_audio(self, audio_path: str, batch_size: int) -> Dict:
        result = {"file_name": os.path.basename(audio_path), "status": "processing"}
        wav_paths = []
        try:
            wav_paths = AudioProcessor.convert_to_wav(audio_path, self.temp_dir)
            if not wav_paths:
                result["error"] = "格式转换失败,请检查文件是否损坏或格式是否支持"
                result["status"] = "error"
                return result
            audio_features = self._extract_audio_features(wav_paths)
            result.update(audio_features)
            result["duration_str"] = self._format_duration(audio_features["duration"])
            all_segments, full_text = self._process_asr_segments(wav_paths)
            agent_segments, customer_segments = self._identify_speakers(all_segments)
            result["asr_text"] = self._generate_labeled_text(all_segments, agent_segments, customer_segments).strip()
            text_analysis = self._analyze_text(agent_segments, customer_segments, batch_size)
            result.update(text_analysis)
            service_check = self._check_service_rules(agent_segments)
            result.update(service_check)
            result["issue_resolved"] = self._check_issue_resolution(customer_segments, agent_segments)
            result["status"] = "success"
        except Exception as e:
            result["error"] = f"分析失败: {str(e)}"
            result["status"] = "error"
        finally:
            # convert_to_wav may return None, so guard the cleanup call
            self._cleanup_temp_files(wav_paths or [])
            self._cleanup_resources()
        return result
    def _identify_speakers(self, segments: List[Dict]) -> Tuple[List[Dict], List[Dict]]:
        """Identify the agent using four layers of heuristics."""
        if not segments:
            return [], []
        # Layer 1: opening keywords in the first three segments
        agent_id = self._identify_by_opening(segments)
        # Layer 2: closing keywords in the last three segments
        if agent_id is None:
            agent_id = self._identify_by_closing(segments)
        # Layer 3: salutation / honorific keywords
        if agent_id is None:
            agent_id = self._identify_by_salutation(segments)
        # Layer 4: reassurance keywords
        if agent_id is None:
            agent_id = self._identify_by_reassurance(segments)
        # Fallback: speech-pattern heuristics
        if agent_id is None and len(segments) >= 4:
            agent_id = self._identify_by_speech_patterns(segments)
        if agent_id is None:
            # Last resort: pick the speaker with the most turns
            spk_counts = {}
            for seg in segments:
                spk_id = seg["spk_id"]
                spk_counts[spk_id] = spk_counts.get(spk_id, 0) + 1
            agent_id = max(spk_counts, key=spk_counts.get) if spk_counts else None
        if agent_id is None:
            return [], []
        return (
            [seg for seg in segments if seg["spk_id"] == agent_id],
            [seg for seg in segments if seg["spk_id"] != agent_id]
        )

    def _identify_by_opening(self, segments: List[Dict]) -> Optional[str]:
        """Layer 1: opening keywords in the first three segments."""
        keywords = DialectProcessor.KEYWORDS["opening"]
        for seg in segments[:3]:
            if any(kw in seg["text"] for kw in keywords):
                return seg["spk_id"]
        return None

    def _identify_by_closing(self, segments: List[Dict]) -> Optional[str]:
        """Layer 2: closing keywords in the last three segments."""
        keywords = DialectProcessor.KEYWORDS["closing"]
        last_segments = segments[-3:] if len(segments) >= 3 else segments
        for seg in reversed(last_segments):
            if any(kw in seg["text"] for kw in keywords):
                return seg["spk_id"]
        return None

    def _identify_by_salutation(self, segments: List[Dict]) -> Optional[str]:
        """Layer 3: salutation / honorific keywords."""
        keywords = DialectProcessor.KEYWORDS["salutation"]
        for seg in segments:
            if any(kw in seg["text"] for kw in keywords):
                return seg["spk_id"]
        return None

    def _identify_by_reassurance(self, segments: List[Dict]) -> Optional[str]:
        """Layer 4: reassurance keywords."""
        keywords = DialectProcessor.KEYWORDS["reassurance"]
        for seg in segments:
            if any(kw in seg["text"] for kw in keywords):
                return seg["spk_id"]
        return None

    def _identify_by_speech_patterns(self, segments: List[Dict]) -> Optional[str]:
        """Fallback: speech-pattern heuristics."""
        speaker_features = {}
        for seg in segments:
            spk_id = seg["spk_id"]
            if spk_id not in speaker_features:
                speaker_features[spk_id] = {"total_duration": 0.0, "turn_count": 0, "question_count": 0}
            features = speaker_features[spk_id]
            features["total_duration"] += (seg["end"] - seg["start"])
            features["turn_count"] += 1
            if any(q_word in seg["text"] for q_word in ["吗", "呢", "?", "?", "如何", "怎样"]):
                features["question_count"] += 1
        if speaker_features:
            max_duration = max(f["total_duration"] for f in speaker_features.values())
            if max_duration <= 0:
                return None  # avoid division by zero on zero-length segments
            question_rates = {spk_id: f["question_count"] / f["turn_count"] for spk_id, f in speaker_features.items()}
            candidates = []
            for spk_id, features in speaker_features.items():
                score = (0.6 * (features["total_duration"] / max_duration) + 0.4 * question_rates[spk_id])
                candidates.append((spk_id, score))
            return max(candidates, key=lambda x: x[1])[0]
        return None
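    # Worked example of the fallback score (illustrative numbers only):
    # speaker A talks 120s over 10 turns with 6 questions, speaker B 80s over
    # 8 turns with 1 question. max_duration = 120, so
    #   score(A) = 0.6 * (120/120) + 0.4 * (6/10) = 0.84
    #   score(B) = 0.6 * (80/120)  + 0.4 * (1/8)  = 0.45
    # and A is picked as the agent.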
keywords["forbidden"]: if kw in seg["text"]: found_forbidden.append(kw) break return { "opening_found": found_opening, "closing_found": found_closing, "forbidden_words": ", ".join(set(found_forbidden)) if found_forbidden else "无" } def _check_issue_resolution(self, customer_segments: List[Dict], agent_segments: List[Dict]) -> bool: if not customer_segments or not agent_segments: return False resolution_keywords = ["解决", "处理", "完成", "已", "好了", "可以了", "没问题", "明白", "清楚", "满意", "行"] unresolved_keywords = ["没解决", "不行", "不对", "还是", "仍然", "再", "未", "无法", "不能", "不行", "不满意"] negation_words = ["不", "没", "未", "非", "无"] gratitude_keywords = ["谢谢", "感谢", "多谢", "麻烦", "辛苦", "有劳"] full_conversation = " ".join(seg["text"] for seg in customer_segments + agent_segments) last_customer_text = customer_segments[-1]["text"] for kw in unresolved_keywords: if kw in full_conversation: negation_context = re.search(rf".{{0,5}}{kw}", full_conversation) if negation_context: context = negation_context.group(0) if not any(neg in context for neg in negation_words): return False else: return False if any(kw in last_customer_text for kw in gratitude_keywords): if not any(neg + kw in last_customer_text for neg in negation_words): return True for agent_text in [seg["text"] for seg in agent_segments[-3:]]: if any(kw in agent_text for kw in resolution_keywords): if not any(neg in agent_text for neg in negation_words): return True for cust_seg in customer_segments[-2:]: if any(kw in cust_seg["text"] for kw in ["好", "行", "可以", "明白"]): if not any(neg in cust_seg["text"] for neg in negation_words): return True if any("?" in seg["text"] or "?" in seg["text"] for seg in customer_segments[-2:]): return False return False # ====================== 辅助方法 ====================== def _get_available_batch_size(self) -> int: if not torch.cuda.is_available(): return 4 total_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) per_task_mem = total_mem / self.max_concurrent return 2 if per_task_mem < 2 else 4 if per_task_mem < 4 else 8 def _get_max_concurrent_tasks(self) -> int: if torch.cuda.is_available(): total_mem = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3) return 1 if total_mem < 6 else 2 if total_mem < 12 else 3 return max(1, os.cpu_count() // 2) def _check_memory_usage(self) -> bool: try: mem_percent = self.resource_monitor.memory_percent() return mem_percent.get("cpu", 0) > 85 or mem_percent.get("gpu", 0) > 85 except: return False def _extract_audio_features(self, wav_paths: List[str]) -> Dict[str, float]: combined_y = np.array([], dtype=np.float32) sr = ConfigManager().get("sample_rate", 16000) for path in wav_paths: y, _ = librosa.load(path, sr=sr) combined_y = np.concatenate((combined_y, y)) return AudioProcessor.extract_features_from_audio(combined_y, sr) def _process_asr_segments(self, wav_paths: List[str]) -> Tuple[List[Dict], str]: segments = [] full_text = "" batch_size = min(4, len(wav_paths), self._get_available_batch_size()) for i in range(0, len(wav_paths), batch_size): if not self.is_running: break batch_paths = wav_paths[i:i + batch_size] try: results = ModelLoader.asr_pipeline(batch_paths, output_dir=None, batch_size=batch_size) for result in results: for seg in result[0]["sentences"]: segments.append({ "start": seg["start"], "end": seg["end"], "text": seg["text"], "spk_id": seg.get("spk_id", "0") }) full_text += seg["text"] + " " except Exception as e: print(f"ASR批处理错误: {str(e)}") for path in batch_paths: try: result = ModelLoader.asr_pipeline(path, output_dir=None) for seg in 
result[0]["sentences"]: segments.append({ "start": seg["start"], "end": seg["end"], "text": seg["text"], "spk_id": seg.get("spk_id", "0") }) full_text += seg["text"] + " " except: continue return segments, full_text.strip() def _generate_labeled_text(self, all_segments: List[Dict], agent_segments: List[Dict], customer_segments: List[Dict]) -> str: agent_spk_id = agent_segments[0]["spk_id"] if agent_segments else None customer_spk_id = customer_segments[0]["spk_id"] if customer_segments else None labeled_text = [] for seg in all_segments: if seg["spk_id"] == agent_spk_id: speaker = "客服" elif seg["spk_id"] == customer_spk_id: speaker = "客户" else: speaker = f"说话人{seg['spk_id']}" labeled_text.append(f"[{speaker}]: {seg['text']}") return "\n".join(labeled_text) def _cleanup_temp_files(self, paths: List[str]): def safe_remove(path): if os.path.exists(path): try: os.remove(path) except: pass for path in paths: safe_remove(path) now = time.time() for file in os.listdir(self.temp_dir): file_path = os.path.join(self.temp_dir, file) if os.path.isfile(file_path) and (now - os.path.getmtime(file_path)) > 3600: safe_remove(file_path) def _format_duration(self, seconds: float) -> str: minutes, seconds = divmod(int(seconds), 60) hours, minutes = divmod(minutes, 60) return f"{hours:02d}:{minutes:02d}:{seconds:02d}" def _cleanup_resources(self): gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() def stop(self): self.is_running = False # ====================== 模型加载线程 ====================== class ModelLoadThread(QThread): progress_updated = pyqtSignal(int, str) finished = pyqtSignal(bool, str) def run(self): try: config = ConfigManager() # 先检查模型路径是否有效 paths_valid, errors = config.check_model_paths() if not paths_valid: self.finished.emit(False, f"模型路径无效:\n{chr(10).join(errors)}") return self.progress_updated.emit(20, "加载语音识别模型...") ModelLoader._load_asr_model(config.get("model_paths")["asr"]) self.progress_updated.emit(60, "加载情感分析模型...") ModelLoader._load_sentiment_model(config.get("model_paths")["sentiment"]) self.progress_updated.emit(100, "模型加载完成") self.finished.emit(True, "模型加载成功") except Exception as e: self.finished.emit(False, f"模型加载失败: {str(e)}") # ====================== GUI主界面(简化版) ====================== class MainWindow(QMainWindow): def __init__(self): super().__init__() self.setWindowTitle("贵州方言客服质检系统") self.setGeometry(100, 100, 1200, 800) self.setup_ui() self.setup_menu() self.analysis_thread = None self.model_load_thread = None self.temp_dir = "temp_wav" os.makedirs(self.temp_dir, exist_ok=True) self.model_loaded = False # 初始化时检查依赖和模型配置 self.check_initial_setup() def setup_ui(self): main_widget = QWidget() main_layout = QVBoxLayout() main_widget.setLayout(main_layout) self.setCentralWidget(main_widget) toolbar = QToolBar("主工具栏") self.addToolBar(toolbar) actions = [ ("添加文件", "icons/add.png", self.add_files), ("开始分析", "icons/start.png", self.start_analysis), ("停止分析", "icons/stop.png", self.stop_analysis), ("设置", "icons/settings.png", self.open_settings) ] for name, icon, func in actions: action = QAction(QIcon(icon), name, self) action.triggered.connect(func) toolbar.addAction(action) splitter = QSplitter(Qt.Horizontal) main_layout.addWidget(splitter) left_widget = QWidget() left_layout = QVBoxLayout() left_widget.setLayout(left_layout) left_layout.addWidget(QLabel("待分析文件列表")) self.file_list = QListWidget() self.file_list.setSelectionMode(QListWidget.ExtendedSelection) left_layout.addWidget(self.file_list) right_widget = QWidget() right_layout = QVBoxLayout() 

# ====================== Main GUI window (simplified) ======================
class MainWindow(QMainWindow):
    def __init__(self):
        super().__init__()
        self.setWindowTitle("贵州方言客服质检系统")
        self.setGeometry(100, 100, 1200, 800)
        self.setup_ui()
        self.setup_menu()
        self.analysis_thread = None
        self.model_load_thread = None
        self.temp_dir = "temp_wav"
        os.makedirs(self.temp_dir, exist_ok=True)
        self.model_loaded = False
        # Check dependencies and model configuration at startup
        self.check_initial_setup()

    def setup_ui(self):
        main_widget = QWidget()
        main_layout = QVBoxLayout()
        main_widget.setLayout(main_layout)
        self.setCentralWidget(main_widget)
        toolbar = QToolBar("主工具栏")
        self.addToolBar(toolbar)
        actions = [
            ("添加文件", "icons/add.png", self.add_files),
            ("开始分析", "icons/start.png", self.start_analysis),
            ("停止分析", "icons/stop.png", self.stop_analysis),
            ("设置", "icons/settings.png", self.open_settings)
        ]
        for name, icon, func in actions:
            action = QAction(QIcon(icon), name, self)
            action.triggered.connect(func)
            toolbar.addAction(action)
        splitter = QSplitter(Qt.Horizontal)
        main_layout.addWidget(splitter)
        left_widget = QWidget()
        left_layout = QVBoxLayout()
        left_widget.setLayout(left_layout)
        left_layout.addWidget(QLabel("待分析文件列表"))
        self.file_list = QListWidget()
        self.file_list.setSelectionMode(QListWidget.ExtendedSelection)
        left_layout.addWidget(self.file_list)
        right_widget = QWidget()
        right_layout = QVBoxLayout()
        right_widget.setLayout(right_layout)
        right_layout.addWidget(QLabel("分析进度"))
        self.progress_bar = QProgressBar()
        self.progress_bar.setRange(0, 100)
        right_layout.addWidget(self.progress_bar)
        self.current_file_label = QLabel("当前文件: 无")
        right_layout.addWidget(self.current_file_label)
        self.tab_widget = QTabWidget()
        right_layout.addWidget(self.tab_widget, 1)
        text_tab = QWidget()
        text_layout = QVBoxLayout()
        text_tab.setLayout(text_layout)
        self.text_result = QTextEdit()
        self.text_result.setReadOnly(True)
        text_layout.addWidget(self.text_result)
        self.tab_widget.addTab(text_tab, "文本结果")
        detail_tab = QWidget()
        detail_layout = QVBoxLayout()
        detail_tab.setLayout(detail_layout)
        self.result_table = QTableWidget()
        self.result_table.setColumnCount(10)
        self.result_table.setHorizontalHeaderLabels([
            "文件名", "时长", "语速", "音量稳定性", "客服情感", "客户情感",
            "开场白", "结束语", "禁用词", "问题解决"
        ])
        self.result_table.horizontalHeader().setSectionResizeMode(QHeaderView.Stretch)
        detail_layout.addWidget(self.result_table)
        self.tab_widget.addTab(detail_tab, "详细结果")
        splitter.addWidget(left_widget)
        splitter.addWidget(right_widget)
        splitter.setSizes([300, 900])

    def setup_menu(self):
        menu_bar = self.menuBar()
        file_menu = menu_bar.addMenu("文件")
        file_actions = [
            ("添加文件", self.add_files),
            ("导出结果", self.export_results),
            ("退出", self.close)
        ]
        for name, func in file_actions:
            action = QAction(name, self)
            action.triggered.connect(func)
            file_menu.addAction(action)
        analysis_menu = menu_bar.addMenu("分析")
        analysis_actions = [
            ("开始分析", self.start_analysis),
            ("停止分析", self.stop_analysis)
        ]
        for name, func in analysis_actions:
            action = QAction(name, self)
            action.triggered.connect(func)
            analysis_menu.addAction(action)
        settings_menu = menu_bar.addMenu("设置")
        settings_actions = [
            ("系统配置", self.open_settings),
            ("加载模型", self.load_models)
        ]
        for name, func in settings_actions:
            action = QAction(name, self)
            action.triggered.connect(func)
            settings_menu.addAction(action)

    def check_initial_setup(self):
        """Check dependencies and model paths at startup."""
        ffmpeg_available, ffmpeg_msg = check_ffmpeg_available()
        if not ffmpeg_available:
            QMessageBox.critical(
                self, "音频处理依赖缺失",
                f"无法处理音频: {ffmpeg_msg}\n\n请安装ffmpeg并确保其在系统PATH中。\nWindows用户可从https://ffmpeg.org/download.html下载并添加到环境变量。"
            )
        config = ConfigManager()
        paths_valid, errors = config.check_model_paths()
        if not paths_valid:
            msg = QMessageBox()
            msg.setIcon(QMessageBox.Warning)
            msg.setText("模型路径配置不正确")
            msg.setInformativeText(f"检测到以下问题:\n{chr(10).join(errors)}\n\n是否现在进行配置?")
            msg.setWindowTitle("配置模型路径")
            msg.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
            if msg.exec_() == QMessageBox.Yes:
                self.open_settings()

    def add_files(self):
        files, _ = QFileDialog.getOpenFileNames(
            self, "选择音频文件", "", "音频文件 (*.mp3 *.wav *.amr *.m4a)"
        )
        for file in files:
            self.file_list.addItem(file)
    def start_analysis(self):
        ffmpeg_available, ffmpeg_msg = check_ffmpeg_available()
        if not ffmpeg_available:
            QMessageBox.critical(
                self, "音频处理依赖缺失",
                f"无法开始分析: {ffmpeg_msg}\n\n请安装ffmpeg并确保其在系统PATH中。\nWindows用户可从https://ffmpeg.org/download.html下载并添加到环境变量。"
            )
            return
        if self.file_list.count() == 0:
            QMessageBox.warning(self, "警告", "请先添加要分析的音频文件")
            return
        config = ConfigManager()
        paths_valid, errors = config.check_model_paths()
        if not paths_valid:
            msg = QMessageBox()
            msg.setIcon(QMessageBox.Warning)
            msg.setText("模型路径配置不正确")
            msg.setInformativeText(f"检测到以下问题:\n{chr(10).join(errors)}\n\n是否现在进行配置?")
            msg.setWindowTitle("配置模型路径")
            msg.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
            if msg.exec_() == QMessageBox.Yes:
                self.open_settings()
                # Re-check after the settings dialog closes
                paths_valid, _ = config.check_model_paths()
                if not paths_valid:
                    return
            else:
                return
        if not self.model_loaded:
            reply = QMessageBox.question(
                self, "模型未加载", "模型尚未加载,是否立即加载?",
                QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes
            )
            if reply == QMessageBox.Yes:
                self.load_models()
                return  # wait for loading to finish, then start again manually
            else:
                return
        audio_paths = [self.file_list.item(i).text() for i in range(self.file_list.count())]
        self.text_result.clear()
        self.result_table.setRowCount(0)
        self.analysis_thread = AnalysisThread(audio_paths, self.temp_dir)
        self.analysis_thread.progress_updated.connect(self.update_progress)
        self.analysis_thread.result_ready.connect(self.handle_result)
        self.analysis_thread.finished_all.connect(self.analysis_finished)
        self.analysis_thread.error_occurred.connect(self.show_error)
        self.analysis_thread.memory_warning.connect(self.handle_memory_warning)
        self.analysis_thread.start()

    def stop_analysis(self):
        if self.analysis_thread and self.analysis_thread.isRunning():
            self.analysis_thread.stop()
            self.analysis_thread.wait()
            QMessageBox.information(self, "信息", "分析已停止")

    def load_models(self):
        config = ConfigManager()
        paths_valid, errors = config.check_model_paths()
        if not paths_valid:
            msg = QMessageBox()
            msg.setIcon(QMessageBox.Warning)
            msg.setText("模型路径配置不正确")
            msg.setInformativeText(f"检测到以下问题:\n{chr(10).join(errors)}\n\n是否现在进行配置?")
            msg.setWindowTitle("配置模型路径")
            msg.setStandardButtons(QMessageBox.Yes | QMessageBox.No)
            if msg.exec_() == QMessageBox.Yes:
                self.open_settings()
                paths_valid, _ = config.check_model_paths()
                if not paths_valid:
                    return
            else:
                return
        if self.model_load_thread and self.model_load_thread.isRunning():
            return
        self.model_load_thread = ModelLoadThread()
        self.model_load_thread.progress_updated.connect(lambda value, _: self.progress_bar.setValue(value))
        self.model_load_thread.load_finished.connect(self.handle_model_load_result)
        self.model_load_thread.start()

    def update_progress(self, progress: int, message: str, current_file: str):
        self.progress_bar.setValue(progress)
        self.current_file_label.setText(f"当前文件: {current_file}")
result["closing_found"] else "否", result["forbidden_words"], "是" if result["issue_resolved"] else "否" ] for col, text in enumerate(items): item = QTableWidgetItem(text) if col in [6, 7] and text == "否": item.setBackground(QColor(255, 200, 200)) if col == 8 and text != "无": item.setBackground(QColor(255, 200, 200)) if col == 9 and text == "否": item.setBackground(QColor(255, 200, 200)) self.result_table.setItem(row, col, item) elif result["status"] == "error": self.text_result.append(f"文件: {result['file_name']}\n状态: 错误\n原因: {result['error']}\n" + "=" * 50 + "\n") def analysis_finished(self): QMessageBox.information(self, "完成", "所有音频分析完成") self.progress_bar.setValue(100) def show_error(self, title: str, message: str): QMessageBox.critical(self, title, message) def handle_memory_warning(self): QMessageBox.warning(self, "内存警告", "内存使用过高,分析已停止") def handle_model_load_result(self, success: bool, message: str): if success: self.model_loaded = True QMessageBox.information(self, "成功", message) else: QMessageBox.critical(self, "错误", message) def open_settings(self): settings_dialog = QDialog(self) settings_dialog.setWindowTitle("系统设置") settings_dialog.setFixedSize(500, 300) layout = QVBoxLayout() config = ConfigManager().get("model_paths") settings = [ ("ASR模型路径:", config["asr"], self.browse_directory), ("情感模型路径:", config["sentiment"], self.browse_directory) ] for label, value, func in settings: h_layout = QHBoxLayout() h_layout.addWidget(QLabel(label)) line_edit = QLineEdit(value) browse_btn = QPushButton("浏览...") browse_btn.clicked.connect(lambda _, le=line_edit: func(le)) h_layout.addWidget(line_edit) h_layout.addWidget(browse_btn) layout.addLayout(h_layout) spin_settings = [ ("最大并发任务:", "max_concurrent", 1, 8), ("最大音频时长(秒):", "max_audio_duration", 60, 86400) ] for label, key, min_val, max_val in spin_settings: h_layout = QHBoxLayout() h_layout.addWidget(QLabel(label)) spin_box = QSpinBox() spin_box.setRange(min_val, max_val) spin_box.setValue(ConfigManager().get(key, min_val)) h_layout.addWidget(spin_box) layout.addLayout(h_layout) button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel) button_box.accepted.connect(settings_dialog.accept) button_box.rejected.connect(settings_dialog.reject) layout.addWidget(button_box) settings_dialog.setLayout(layout) if settings_dialog.exec_() == QDialog.Accepted: # 保存模型路径配置 ConfigManager().set("model_paths", { "asr": layout.itemAt(0).layout().itemAt(1).widget().text(), "sentiment": layout.itemAt(1).layout().itemAt(1).widget().text() }) # 保存其他配置 ConfigManager().set("max_concurrent", layout.itemAt(2).layout().itemAt(1).widget().value()) ConfigManager().set("max_audio_duration", layout.itemAt(3).layout().itemAt(1).widget().value()) # 重新加载模型 if self.model_loaded: reply = QMessageBox.question( self, "配置已更新", "模型路径已更改,是否立即重新加载模型?", QMessageBox.Yes | QMessageBox.No, QMessageBox.Yes ) if reply == QMessageBox.Yes: self.load_models() def browse_directory(self, line_edit): path = QFileDialog.getExistingDirectory(self, "选择目录") if path: line_edit.setText(path) def export_results(self): if self.result_table.rowCount() == 0: QMessageBox.warning(self, "警告", "没有可导出的结果") return path, _ = QFileDialog.getSaveFileName(self, "保存结果", "", "CSV文件 (*.csv)") if not path: return try: with open(path, "w", encoding="utf-8") as f: headers = [self.result_table.horizontalHeaderItem(col).text() for col in range(self.result_table.columnCount())] f.write(",".join(headers) + "\n") for row in range(self.result_table.rowCount()): row_data = [self.result_table.item(row, col).text() 
    def export_results(self):
        if self.result_table.rowCount() == 0:
            QMessageBox.warning(self, "警告", "没有可导出的结果")
            return
        path, _ = QFileDialog.getSaveFileName(self, "保存结果", "", "CSV文件 (*.csv)")
        if not path:
            return
        try:
            # Hand-rolled CSV; the csv module would also handle embedded
            # quotes and newlines.
            with open(path, "w", encoding="utf-8") as f:
                headers = [self.result_table.horizontalHeaderItem(col).text()
                           for col in range(self.result_table.columnCount())]
                f.write(",".join(headers) + "\n")
                for row in range(self.result_table.rowCount()):
                    row_data = [self.result_table.item(row, col).text()
                                for col in range(self.result_table.columnCount())]
                    # Quote fields that contain commas
                    row_data = [f'"{data}"' if ',' in data else data for data in row_data]
                    f.write(",".join(row_data) + "\n")
            QMessageBox.information(self, "成功", f"结果已导出到: {path}")
        except Exception as e:
            QMessageBox.critical(self, "错误", f"导出失败: {str(e)}")

    def closeEvent(self, event):
        if self.analysis_thread and self.analysis_thread.isRunning():
            self.analysis_thread.stop()
            self.analysis_thread.wait()
        try:
            for file in os.listdir(self.temp_dir):
                file_path = os.path.join(self.temp_dir, file)
                if os.path.isfile(file_path):
                    for _ in range(3):  # retry briefly in case a handle is still open
                        try:
                            os.remove(file_path)
                            break
                        except Exception:
                            time.sleep(0.1)
            os.rmdir(self.temp_dir)
        except Exception:
            pass
        event.accept()


# ====================== Entry point ======================
if __name__ == "__main__":
    torch.set_num_threads(4)
    app = QApplication(sys.argv)
    app.setStyle('Fusion')
    window = MainWindow()
    window.show()
    sys.exit(app.exec_())
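
The longest-match lookup in DialectProcessor.preprocess_text is the most behaviour-changing fix above. Below is a minimal, self-contained sketch of the same idea that can be run on its own; the helper name longest_match_replace and the two-entry mapping are illustrative, not part of the application:

from typing import Dict, List

def longest_match_replace(text: str, mapping: Dict[str, str]) -> str:
    """Replace dialect words with their longest mapping entry, scanning left to right."""
    # Build a character trie; a "$" key on a node stores the replacement.
    trie: dict = {}
    for word, repl in mapping.items():
        node = trie
        for ch in word:
            node = node.setdefault(ch, {})
        node["$"] = repl
    out: List[str] = []
    i = 0
    while i < len(text):
        node, j, last = trie, i, None
        while j < len(text) and text[j] in node:
            node = node[text[j]]
            j += 1
            if "$" in node:
                last = (j, node["$"])  # remember the longest match so far
        if last:
            i = last[0]
            out.append(last[1])
        else:
            out.append(text[i])
            i += 1
    return "".join(out)

if __name__ == "__main__":
    mapping = {"归一": "结束", "归一喽": "完成了"}
    assert longest_match_replace("电话归一喽", mapping) == "电话完成了"
    print("longest match ok")

Deferring the replacement decision until the trie walk can go no further is what makes the longer entry win; breaking at the first terminal node, as the original code did, silently prefers the shorter one.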