A collection of commonly used Python code snippets, for quick reference.
# Make Jupyter Notebook show all rows and columns of a DataFrame
import pandas as pd
#pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Suppress warning messages
import warnings
warnings.filterwarnings('ignore')
# Display every expression's output in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
# Common imports
import numpy as np
import os
import sys
import math
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Add a directory to the module search path (this affects imports, not the working directory)
sys.path.append("/home/workdir/01XXXX")  # the current location is "./"
# Import your own code or functions
# Relative to the directory above: import def readin from script/ranking.py, or import the whole ranking module
from script.ranking import readin
import script.ranking as rk
# Import def AAAA from utils.py, or import the whole utils module
from utils import AAAA  # "from .utils import AAAA" only works inside a package
import utils
Reading files and basic operations
# Read a csv file
path = '../data/test.csv'
df = pd.read_csv(path)  # use sep='\t' for tab-separated files
# Read several files in a loop and combine them
file_list = ["1.csv", "2.csv", "3.csv"]
for i in range(len(file_list)):
    if i == 0:
        df = pd.read_csv('data/' + file_list[i])
    else:
        df2 = pd.read_csv('data/' + file_list[i])
        df = pd.concat([df, df2], axis=0)  # stack rows
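Reading all files first and concatenating once is usually cleaner and faster; a minimal sketch assuming the same data/ layout:
frames = [pd.read_csv('data/' + f) for f in file_list]
df = pd.concat(frames, axis=0, ignore_index=True)  # stack rows and renumber the index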
# Rename columns
df = df.rename(columns={"Unnamed: 0": "id"})
# Sum over selected columns
df['score'] = df[select_features].sum(axis=1)  # same as .apply(lambda x: x.sum(), axis=1), but faster
# Sort
df = df.sort_values('score', ascending=False)
# Reset the index
df.reset_index(drop=True, inplace=True)
# Numeric processing
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets, decomposition, preprocessing
# Convert to a NumPy array
X = df[sel_features].to_numpy()
# Normalization: scale each feature to [0, 1]
scaler = preprocessing.MinMaxScaler().fit(X)
X = scaler.transform(X)
# Replace substrings in a string
line = line.replace("\n", "")
# Batch character replacement: reverse-complement a DNA sequence
def reverse(line):  # for reverse reads, take the reverse complement
    comp = {"A": "T", "T": "A", "G": "C", "C": "G", "N": "N"}
    seq = ""
    for base in line:
        seq += comp[base]
    return seq[::-1]
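For long sequences, str.translate with a precomputed table does the same substitution much faster; a sketch covering the same alphabet:
comp_table = str.maketrans("ATGCN", "TACGN")
def reverse_fast(line):
    return line.translate(comp_table)[::-1]  # complement each base, then reverse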
# Searching
line.find(">")  # line is a str; returns the position, or -1 if not found
seq2 = re.findall(r"MA[A-Z]{9,15}GGSA", seq)  # returns the full match
seq2 = re.findall(r"MA([A-Z]{9,15})GGSA", seq)  # returns only the content of the () group
# Collecting results
peptides = []  # or peptides = ""; the output format differs, try both
……
if line:  # line is still non-empty after processing
    peptides.append(line)  # append adds line as a single element
# or
peptides.extend(line)  # extend adds each character of a string separately
# Building paths by splitting strings
f = NGS_path + file_list[0].split('/')[-1][:-9] + ".fasta"  # strip the directory and the 9-char ".fastq.gz" suffix
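A pathlib version of the same path surgery may read more clearly (a sketch; str.removesuffix needs Python 3.9+):
from pathlib import Path
name = Path(file_list[0]).name  # drop the directory part
f = NGS_path + name.removesuffix(".fastq.gz") + ".fasta"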
# Convert to a dataframe or a list
data.to_frame()
df.age.values.tolist()
# Create a new column whose name contains a variable i
df[str(i) + '_Count'] = df['A'] + df['B']
# Median / mean with numpy
np.median(df['score'])
np.mean(df['score'])
# Group rows by a condition
# cutoff = 200
df.loc[df['score'] >= cutoff, 'Group'] = "High"
df.loc[df['score'] < cutoff, 'Group'] = "Low"
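The two assignments above collapse into a single np.where call:
df['Group'] = np.where(df['score'] >= cutoff, 'High', 'Low')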
Extracting rows and columns
# Select rows by the value of a column
df = df[df.Name == "demo"]
df = df[df['means'] > 0.47]  # equivalent: df[df.means > 0.47]
df = df[(df['LogA'] > -5) & (df['LogC'] < 8)]
# Select specific rows or columns
df.loc[3:6]  # rows with index labels 3 to 6 (loc is label-based and inclusive)
data.loc[[idx]]  # select a row by index label; also data.loc[idx:idx]
df.iloc[:, 3:6]  # columns at positions 3 to 5 (iloc is position-based, end exclusive)
df[['col_name1', 'col_names2']]
Extracting specific elements
df.iloc[2, 3]  # element at row position 2, column position 3 (chained df.loc[2][3] is discouraged)
# df.loc[row labels, column names]
df.loc[2:4, ['col_names1', 'col_names2']]
# df.iloc[row positions, column positions]
df.iloc[2:4, 1:3]  # rows 2-3, columns 1-2 (by position)
# Drop specific rows or columns
df.drop(['Amy', 'Bod'])  # by row label (index); axis=0 is the default
df.drop(df.index[[1, 3, 5]])  # by row position
df.drop('name', axis=1)
df.drop(columns=['name', 'age'])  # by column name (columns= already implies axis=1)
df.drop(df.columns[[1, 3]], axis=1)  # by column position
Handling NA values
df = df.dropna(axis=0, how='any')  # drop every row that contains an NA
df = df.dropna(axis=1, how='any')  # drop every column that contains an NA
df = df.dropna(axis=0, how='all')  # drop rows that are entirely NA
df = df.fillna(0)  # replace NA with 0
Quickly merging two dataframes
# A time-saving approach: find the shared keys first, then build the new dataframe
sample = list(set(A["CCLE_Name"]).intersection(set(B.index)))  # B is keyed by its index, A by the CCLE_Name column
A.loc[A["CCLE_Name"].isin(sample)].reset_index(drop=True)  # keep the rows of A with shared keys
# Concatenate the two datasets side by side (column-wise)
pd.concat([A.reset_index(drop=True), B.reindex(A["CCLE_Name"].values).reset_index(drop=True)], axis=1)  # align B (indexed by CCLE_Name) to A's row order; much faster than merge
sel_features = list(set(Type_1).union(set(Type_2)))  # union; use intersection for the overlap
pd.concat([df.iloc[:, :1], df[sel_features]], axis=1)  # df.iloc[:, :1] is the first column
# merge also works, but it is slower; fine for small datasets
pd.merge(A, B.loc[:, ['Entity ID', "linker", KD]], how="left", on='id')  # how = left/right/inner/outer
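merge can also join A's CCLE_Name column directly against B's index, avoiding the manual alignment above; a sketch with the same A and B:
pd.merge(A, B, left_on="CCLE_Name", right_index=True, how="left")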
Output to screen / files
print("output file: %s" % i)
print("Selecting", len(sel_features)+1, "indicators:",sel_features)
df.to_csv("gene__drug__cor.csv")
out_file = str(col_list[0]) +'_' + str(col_list[-1]) + '.csv'
cor.to_csv(out_file)
# Output csv file taken from the command line
if args.OUT != "N":
    df_out[['Entity ID', 'A', 'B']].to_csv(args.OUT, index=False)
# Output jpg file taken from the command line
if args.PLOT == "Y":
    images_path = "./"
    plt.figure(dpi=120, figsize=(17, 6))
    plt.rc('ytick', labelsize="8")  # y-axis tick labels
    plt.rc('xtick', labelsize="8")
    # (plotting code here)
    plt.savefig("out.plot.jpg", bbox_inches='tight')
Command-line arguments and function calls
import argparse
from datetime import datetime
parser = argparse.ArgumentParser(description='argparse testing')
parser.add_argument('--IN', '-i', type=str, required=True, help="please clarify the input file path")
parser.add_argument('--ROUND', '-r', type=int, required=True, help="which round of peptides would you like to analyze?")
parser.add_argument('--PLOT', '-p', type=str, default="N", required=False, help="Plot or not")
parser.add_argument('--OUT', '-o', type=str, default="N", required=False, help="Output file name")
# Print to screen
args = parser.parse_args()
print("##############STARTING###############")
print("Running time:", datetime.now())
print("Input file:", args.IN)
def func(file, KD, r, p):
    ……
    return results, df  # avoid shadowing the built-in name "list"
lists, df_out = func(args.IN, args.KD, args.ROUND, args.PLOT)  # assumes a --KD argument defined elsewhere
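Invoked from the shell, a run then looks like this (hypothetical script and file names):
python analysis.py --IN input.csv --ROUND 3 --PLOT Y --OUT out.csv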
Statistical analysis
# Count the frequency of each value
dat['gender'].value_counts()  # pd.value_counts(dat.gender) is deprecated
## Correlations
# one column vs. many columns
from scipy.stats import pearsonr, spearmanr
cor = df.iloc[:, j:].apply(lambda x: spearmanr(x, df.iloc[:, i]))  # or pearsonr
# full matrix
cor = dat.corr(method='spearman', min_periods=1, numeric_only=False)
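spearmanr returns a (correlation, p-value) pair per column, so the two parts can be split out afterwards; a minimal sketch:
res = df.iloc[:, j:].apply(lambda x: spearmanr(x, df.iloc[:, i]))
rho = res.apply(lambda t: t[0])   # correlation coefficients
pval = res.apply(lambda t: t[1])  # p-values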
Running shell commands
import os
NGS_path = "./data_vIII/fastq_gz/"
command = "ls %s*.fastq.gz" % (NGS_path)
#file_list = os.popen(command).read().split("\n")
file_list = [i for i in os.popen(command).read().split("\n") if i != '']  # drop empty entries
command = "mkdir -p %sQC/" % (NGS_path)
os.system(command)
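The same two steps have pure-Python equivalents that skip the shell entirely:
import glob
file_list = sorted(glob.glob(NGS_path + "*.fastq.gz"))
os.makedirs(NGS_path + "QC/", exist_ok=True)  # like mkdir -p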
Counting
# Don't casually call .count() on very large files
def count(read_list):
    countDict = dict()
    total = len(read_list)
    # count peptides; calling read_list.count(pep) per element is O(n^2), far too slow
    for pep in read_list:
        if pep not in countDict:
            countDict[pep] = 1
        else:
            countDict[pep] += 1
    # sort the counts into a dataframe
    PEP_count = pd.DataFrame.from_dict(countDict, orient='index', columns=['Count'])
    PEP_count['Library fraction'] = PEP_count['Count'] / total
    PEP_count = PEP_count.sort_values("Library fraction", ascending=False)
    return PEP_count
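collections.Counter does the same O(n) tally in one call; a sketch producing the same dataframe:
from collections import Counter
def count_fast(read_list):
    counts = Counter(read_list)
    PEP_count = pd.DataFrame.from_dict(counts, orient='index', columns=['Count'])
    PEP_count['Library fraction'] = PEP_count['Count'] / len(read_list)
    return PEP_count.sort_values("Library fraction", ascending=False)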
Multi-CPU parallelism
import multiprocessing
from multiprocessing import Process
def run_multi_rounds(file):
    f = NGS_path + file.split('/')[-1][:-11] + "_1.fasta"  # path to the forward (_1) fasta file
    r = NGS_path + file.split('/')[-1][:-11] + "_2.fasta"  # path to the reverse (_2) fasta file
    f_peptides = match_pattern(fa2AA(f))  # translate & extract peptide sequences
    r_peptides = match_pattern(fa2AA(r))
    peptides_count = count_paired_end_fraction(f_peptides, r_peptides)  # peptide counts
    df = pd.DataFrame(peptides_count)
    out_file = NGS_path + f[-35:][:6] + '_peptide_fraction.csv'  # output file name, one per round
    df.to_csv(out_file)
    print("output file: %s" % out_file)
# run all rounds at the same time
# file_list indices 0, 2, 4: each round has two files (_1 and _2)
def run_Process():
    process = [multiprocessing.Process(target=run_multi_rounds, args=(file_list[0],)),
               multiprocessing.Process(target=run_multi_rounds, args=(file_list[2],)),
               multiprocessing.Process(target=run_multi_rounds, args=(file_list[4],)),
               ]
    [p.start() for p in process]
    [p.join() for p in process]
if __name__ == '__main__':
    run_Process()
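A multiprocessing.Pool expresses the same fan-out without spelling out each Process; a sketch assuming the same file_list:
from multiprocessing import Pool
if __name__ == '__main__':
    with Pool(processes=3) as pool:
        pool.map(run_multi_rounds, [file_list[0], file_list[2], file_list[4]])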
PCA: inspecting PC importance
X = df[sel_features].to_numpy()
i = df[sel_features].columns.values.tolist()
scaler = preprocessing.MinMaxScaler().fit(X)  # normalize first
X = scaler.transform(X)
#PCA heatmap
pc = 10
pca = decomposition.PCA(n_components=pc)
pca.fit(X.T)  # fit on the transposed matrix so each feature gets PC loadings
print(pca.explained_variance_ratio_)  # proportion of variance per PC (can plot a scree plot)
X_pca = pca.transform(X.T)
#PLOTTING
df_p = pd.DataFrame(X_pca, index=i, columns=['PC%s' % n for n in range(1, pc + 1)])  # e.g. ["PC1", "PC2", ..., "PC10"]
plt.figure(dpi=120)
plt.rc('ytick', labelsize="6")
plt.rc('xtick', labelsize="6")
sns.heatmap(data=df_p, cmap=sns.diverging_palette(100, 200, sep=50, n=17), vmin=-2, vmax=2, yticklabels=True)  # vmin/vmax fix a symmetric color scale; the 100 and 200 in diverging_palette set the hues
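The scree plot mentioned above follows directly from explained_variance_ratio_; a minimal sketch:
plt.figure(dpi=120)
plt.plot(range(1, pc + 1), pca.explained_variance_ratio_, 'o-')
plt.xlabel('PC')
plt.ylabel('Explained variance ratio')
plt.show()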
Plotting
from matplotlib import pyplot as plt
### Scatter plot
plt.plot(df['score'], 'o')
plt.xlabel('ID')  # the x-axis is the index
plt.ylabel('score')
plt.show()
### Regression plot (linear fit)
sns.regplot(x="age", y="score", ci=95, data=df)  # ci is a percentage, not a fraction
### Heatmap
scaler = preprocessing.StandardScaler().fit(X)  # normalize first
X = scaler.transform(X)
df_p = pd.DataFrame(X, index=df['ID'], columns=sel_features)  # build a new dataframe
plt.figure(dpi=120)
plt.rc('ytick', labelsize="6")
plt.rc('xtick', labelsize="6")
sns.heatmap(data=df_p.T, cmap=sns.cubehelix_palette(as_cmap=True), yticklabels=True, xticklabels=True)
# df_p.head(2): samples in rows, features in columns
# after transposing (df_p.T): samples on the x-axis, features on the y-axis
# Box plots: several subplots combined in one figure
ind_list = ['Wang', 'Martins', 'AZ']
df_p = df[ind_list + ['Group']]
images_path = "./"
my_dpi = 96
plt.figure(figsize=(1000/my_dpi, 1000/my_dpi), dpi=my_dpi)
plt.rc('ytick', labelsize="8")
plt.rc('xtick', labelsize="8")
for i in range(len(ind_list)):  # for each indicator: run the t-test/U-test first, then plot
    #mannwhitneyu(g1[indicator], g2[indicator], use_continuity=True, alternative='less').pvalue
    #mannwhitneyu(g1[indicator], g2[indicator], use_continuity=True, alternative='greater').pvalue
    #stats.ttest_ind(g1[indicator], g2[indicator])
    # Loop to plot
    plt.subplot(len(ind_list), 4, i + 1)
    sns.boxplot(x="Group", y=ind_list[i], data=df_p, hue="Group", order=["Low", "High"], palette="Blues")
    sns.swarmplot(x="Group", y=ind_list[i], data=df_p, color=".25")  # overlay the points
plt.tight_layout()
plt.savefig("plot_output.jpg")

