最近,一个学医的同学问我,她想找到在人和老鼠身上都显性表达较好的RNA序列(具体的医学用语我不懂),但是序列太多了,问我能不能编个程序找出表达性较好的序列。于是我就开始试着写了。
她告诉我phyloNoncode上有所有人RNA序列和老鼠RNA序列的相关信息。于是我去下载了,结果发现其中很重要的一项指标只能在网页上一条条地查看,并不包括在下载的信息中,几万条啊,简直是坑。于是我先用一个爬虫将这一信息爬下来,再综合到信息列表中。
import urllib2
from bs4 import BeautifulSoup
import re
import urlparse
#num=0
class SpiderMain(object):
    """Fetch one page, extract its record and append it to the output file."""

    def __init__(self):
        # One collaborator per pipeline stage: download -> parse -> output.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        """Run the whole pipeline for the single page at ``root_url``."""
        page_source = self.downloader.download(root_url)
        record = self.parser.parse(root_url, page_source)
        # collect_data() receives whatever parse() returned, including None.
        self.outputer.collect_data(record)
        self.outputer.output_html()
class HtmlDownloader(object):
    """Download the raw body of a page."""

    def download(self, url):
        """Return the body served at ``url``, or ``None`` when unavailable.

        ``None`` is returned when ``url`` is ``None`` or when the server does
        not answer with HTTP 200.  Other failures (for example an unreachable
        host) still raise, so they are not silently hidden.
        """
        if url is None:
            return None
        try:
            response = urllib2.urlopen(url)
        except urllib2.HTTPError:
            # urlopen() raises for 4xx/5xx answers instead of returning a
            # response object, so the status check below would never see them.
            return None
        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Release the connection on every path.
            response.close()
class HtmlParser(object):
    """Extract the name and the "Gene Symbol" value from a gene page."""

    def _get_new_data(self, page_url, soup):
        """Build the record dict for one parsed page.

        The result always holds ``'url'``.  ``'name'`` is added when a
        ``<font>`` tag whose style mentions ``font-size`` exists, and
        ``"Gene Symbol"`` when a ``<tr>`` row with a ``background`` style
        starts with that label.
        """
        record = {'url': page_url}

        heading = soup.find('font', style=re.compile('font-size'))
        if heading is not None:
            # The first 38 characters of the heading text are dropped.
            record['name'] = heading.get_text()[38:]

        for row in soup.find_all('tr', style=re.compile("background")) or []:
            row_text = row.get_text()
            if row_text.startswith("Gene Symbol"):
                # Cut off the 11-character "Gene Symbol" label itself.
                record["Gene Symbol"] = row_text[11:]
        return record

    def parse(self, page_url, html_cont):
        """Parse ``html_cont``; return the record, or ``None`` if either argument is ``None``."""
        if page_url is None or html_cont is None:
            return
        return self._get_new_data(
            page_url,
            BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8'))
class HtmlOutputer(object):
    """Collect parsed records and append them to ``mouse.txt``."""

    def __init__(self):
        # Records queued by collect_data() and written by output_html().
        self.datas = []

    def collect_data(self, data):
        """Queue one record dict; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Append one ``"<name> <Gene Symbol> \\n"`` line per queued record.

        A record lacking either field gets the placeholder ``NA`` in that
        column, so a page without a title or symbol row no longer aborts the
        run with ``KeyError`` and the columns stay aligned.
        """
        # The context manager flushes and closes the file even if a write fails.
        with open("mouse.txt", 'a') as f:
            for data in self.datas:
                f.write("%s " % data.get("name", "NA"))
                f.write("%s " % data.get("Gene Symbol", "NA"))
                f.write("\n")
if __name__ == "__main__":
    # Mouse gene pages.  For the human pages use
    # "http://www.bioinfo.org/phyloNoncode/gene.php?ID=PNCG_HSA0%s" instead.
    url_template = "http://www.bioinfo.org/phyloNoncode/gene.php?ID=PNCG_MMU0%s"
    # Page numbers 1..42558, zero-padded to five digits in the URL.
    for gene_number in range(1, 42559):
        padded_id = str(gene_number).zfill(5)
        # A new spider per page, so each run writes only that page's record.
        SpiderMain().craw(url_template % padded_id)
# print num
随后考虑将爬下来的信息与原有的信息融合在一起。开始考虑写程序实现,后来同学说vim直接就可以编辑。然后就愉快地搞定了。
之后从中筛选出人的RNA序列中在小鼠身上也表达性良好、且Gene Symbol有记录的项。程序如下:
#include <iostream>
#include <string>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <boost/lexical_cast.hpp>
using namespace std;

// Parses `text` as a double. Returns false unless the whole token is numeric,
// so a malformed field is skipped instead of terminating the program.
static bool parseScore(const string &text, double &value)
{
    char *end = NULL;
    value = strtod(text.c_str(), &end);
    return end != text.c_str() && *end == '\0';
}

// Filters mousebase.txt into mousehuman.txt.
//
// Each input line is split on whitespace and only its first three fields are
// used. A line is kept when field 2 is not "NA", field 3 is not "-" and
// field 3 parses to a number >= 0.9. Kept lines are appended to
// mousehuman.txt as "<field1> <field2> <field3>", and the number of kept
// lines is printed at the end.
// NOTE(review): field 2 appears to be the Gene Symbol and field 3 a
// conservation/expression score -- confirm against the layout of mousebase.txt.
int main()
{
    ifstream input("mousebase.txt");
    if (!input)
    {
        cout << "open mousebase.txt error" << endl;
        return 1;
    }
    // Append mode: results of earlier runs are kept, as before.
    ofstream output("mousehuman.txt", ios::out | ios::app);
    if (!output)
    {
        cout << "open mousehuman.txt error" << endl;
        return 1;
    }

    int humanratnum = 0;
    string line;
    // Reading stops at end of file or at the first empty line.
    while (getline(input, line) && !line.empty())
    {
        // A fresh stream per line: a short or over-long line can no longer
        // leak stale or left-over tokens into the following records.
        istringstream fields(line);
        string id, symbol, score;
        if (!(fields >> id >> symbol >> score))
            continue; // fewer than three fields: nothing to evaluate

        if (symbol == "NA" || score == "-")
            continue;

        double value = 0.0;
        // Written as !(value >= 0.9) so that NaN is rejected as well.
        if (!parseScore(score, value) || !(value >= 0.9))
            continue;

        output << id << " " << symbol << " " << score << "\n";
        humanratnum++;
    }
    cout << humanratnum << endl;
    return 0;
}
最后,对比找出人和老鼠中Gene Symbol相同的RNA序列。
#include <iostream>
#include <string>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <map>
#include <vector>
using namespace std;

// Maps the second field of a record to the "<field1> <field3>" strings of
// every record that carries it.
typedef map<string, vector<string> > SymbolTable;

// Reads whitespace-separated records from `in` until end of file or the
// first empty line. For each line with at least three fields, stores
// "<field1> <field3>" under the key <field2>.
static void loadRecords(istream &in, SymbolTable &table)
{
    string line;
    while (getline(in, line) && !line.empty())
    {
        // A fresh stream per line keeps a short line from poisoning the
        // parsing of every line after it.
        istringstream fields(line);
        string id, symbol, score;
        if (!(fields >> id >> symbol >> score))
            continue;
        table[symbol].push_back(id + " " + score);
    }
}

// Appends every entry, each followed by one space, to `row`.
static void appendEntries(const vector<string> &entries, string &row)
{
    for (vector<string>::size_type i = 0; i < entries.size(); ++i)
        row += entries[i] + " ";
}

// Joins humanmouse.txt and mousehuman.txt on their second field.
//
// For every key that occurs in BOTH files, one line is appended to
// combinehumanmouse.txt: the entries from humanmouse.txt followed by the
// entries from mousehuman.txt. The number of such keys is printed at the end.
// A key that is merely repeated inside one file is not a match.
// NOTE(review): the key is presumably the Gene Symbol -- confirm against the
// layout of the two input files.
int main()
{
    ifstream humanFile("humanmouse.txt");
    if (!humanFile)
    {
        cout << "open humanmouse.txt error" << endl;
        return 1;
    }
    ifstream mouseFile("mousehuman.txt");
    if (!mouseFile)
    {
        cout << "open mousehuman.txt error" << endl;
        return 1;
    }
    // Append mode: results of earlier runs are kept, as before.
    ofstream combined("combinehumanmouse.txt", ios::out | ios::app);
    if (!combined)
    {
        cout << "open combinehumanmouse.txt error" << endl;
        return 1;
    }

    SymbolTable human;
    SymbolTable mouse;
    loadRecords(humanFile, human);
    loadRecords(mouseFile, mouse);

    int humanratnum = 0;
    for (SymbolTable::const_iterator h = human.begin(); h != human.end(); ++h)
    {
        SymbolTable::const_iterator m = mouse.find(h->first);
        if (m == mouse.end())
            continue; // key missing from the second file

        string row;
        appendEntries(h->second, row);
        appendEntries(m->second, row);
        combined << row << "\n";
        humanratnum++;
    }
    cout << humanratnum << endl;
    return 0;
}

本文介绍了一个用于筛选人和老鼠RNA序列中表达性良好且GeneSymbol有记录的项的程序。该程序首先通过爬虫获取相关数据,然后进行数据融合及筛选。
7万+

被折叠的 条评论
为什么被折叠?



