|
发表于 2017-2-15 10:51:12
|
显示全部楼层
本帖最后由 AdaWon 于 2017-2-15 11:00 编辑
147-python
#本次任务是:gff3/gtf注释文件的探究
#选用 Arabidopsis 注释文件
1.查看注释文件;
$ less TAIR10_GFF3_genes.gff
Chr1 TAIR10 chromosome 1 30427671 . . . ID=Chr1;Name=Chr1
Chr1 TAIR10 gene 3631 5899 . + . ID=AT1G01010;Note=protein_coding_gene;Name=AT1G01010
Chr1 TAIR10 mRNA 3631 5899 . + . ID=AT1G01010.1; Parent=AT1G01010;Name=AT1G01010.1;Index=1
Chr1 TAIR10 protein 3760 5630 . + . ID=AT1G01010.1-Protein;Name=AT1G01010.1;Derives_from=AT1G01010.1
Chr1 TAIR10 exon 3631 3913 . + . Parent=AT1G01010.1
Chr1 TAIR10 five_prime_UTR 3631 3759 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 3760 3913 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein;
Chr1 TAIR10 exon 3996 4276 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 3996 4276 . + 2 Parent=AT1G01010.1,AT1G01010.1-Protein;
Chr1 TAIR10 exon 4486 4605 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 4486 4605 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein;
Chr1 TAIR10 exon 4706 5095 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 4706 5095 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein;
Chr1 TAIR10 exon 5174 5326 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 5174 5326 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein;
Chr1 TAIR10 exon 5439 5899 . + . Parent=AT1G01010.1
Chr1 TAIR10 CDS 5439 5630 . + 0 Parent=AT1G01010.1,AT1G01010.1-Protein;
$ cut -f3 TAIR10_GFF3_genes.gff |sort |uniq
CDS
chromosome
exon
five_prime_UTR
gene
miRNA
mRNA
mRNA_TE_gene
ncRNA
protein
pseudogene
pseudogenic_exon
pseudogenic_transcript
rRNA
snoRNA
snRNA
three_prime_UTR
transposable_element_gene
tRNA
2. 根据注释文件的内容,对python教学视频的代码略作调整,以实现对拟南芥注释文件的探索;
[Python] 纯文本查看 复制代码
import sys
import re #在Python中使用正则表达式需要标准库中的一个包re
args = sys.argv
#sys.argv[]是用来获取命令行参数的,sys.argv[0]表示代码本身文件路径,所以参数从1开始
class Genome_info:
def __init__(self):
self.chr =""
self.start = 0
self.end = 0
class Gene(Genome_info):
def __init__(self):
Genome_info.__init__(self)
self.orientation =""
self.id = ""
class Transcript(Genome_info):
def __init__(self):
Genome_info.__init__(self)
self.id = ""
self.parent = ""
class Exon(Genome_info):
def __init__(self):
Genome_info.__init__(self)
self.parent = ""
def main(args):
"""
一个输入参数;
第一个参数为物种gtf文件
:return:
"""
list_chr = []
list_gene = {}
list_transcript = {}
list_exon = []
with open(args[1]) as fp_gtf:
for line in fp_gtf:
if line.startswith("#"): #注意:是starts 非 start
#startswith() 方法用于检查字符串是否是以指定子字符串开头,如果是则返回 True,否则返回 False。
continue
#continue 语句用来告诉Python跳过当前循环的剩余语句,然后继续进行下一轮循环。
#continue 语句跳出本次循环,而break跳出整个循环。
lines = line.strip("\n").split("\t")
chr = lines[0] #染色体
type = lines[2] #如:gene,exon,...
start = int(lines[3])
end = int(lines[4])
orientation = lines[6]
attr = lines[8]
#m = re.search(pattern, string) 搜索整个字符串,直到发现符合的子字符串。
#import re
#m = re.search('[0-9]','abcd4ef')
#print(m.group(0))
#re.search()接收两个参数,第一个'[0-9]'表示从字符串想要找的是从0到9的一个数字字符;
#re.search()如果从第二个参数找到符合要求的子字符串,就返回一个对象m,
#通过m.group()的方法查看搜索到的结果。如果没有找到符合要求的字符,re.search()会返回None。
if not chr in list_chr:
list_chr.append(chr)
if type == "gene":
gene = Gene()
id = re.search(r'ID=([^;]+);?',attr).group(1)
#[url=http://www.runoob.com/python/python-reg-expressions.html]http://www.runoob.com/python/python-reg-expressions.html[/url]
##group(0)是整个正则表达的搜索结果,group(1)是第一个群……
#"+" 匹配1个或多个的表达式
#"?" 匹配0个或1个由前面的正则表达式定义的片段
#[^...] 不在[]中的字符; [^abc] 匹配除了a,b,c之外的字符。
#(re) 匹配括号内的表达式,也表示一个组
gene.chr = chr
gene.start = start
gene.end = end
gene.orientation = orientation
list_gene(id) = gene
#print(id)
elif type == "mRNA":
transcript = Transcript()
id = re.search(r'ID=([^;]+);?',attr).group(1)
parent =re.search(r'Parent=([^;]+);?',attr).group(1)
if not parent in list_gene:
continue
transcript.chr = chr
transcript.start = start
transcript.end = end
transcript.id = id
transcript.parent = parent
list_transcript(id) = transcript
elif type == "exon":
exon = Exon()
parent = re.search(r'Parent=([^;]+);?',attr).group(1)
if not parent in list_transcript:
continue
exon.chr = chr
exon.start = start
exon.end = end
exon.parent = parent
list_exon.append(exon)
chr_gene(list_gene)
gene_len(list_gene)
gene_transcript(list_transcript)
transcript_exon(list_exon)
exon_pos(list_exon)
def chr_gene(list_gene):
"""
chr_gene
:param list_gene:
:return:
"""
print("chr_gene")
count_gene = {}
for info in list_gene.values():
chr = info.chr
if chr in count_gene:
count_gene[info.chr] += 1
else:
count_gene[info.chr] = 1
with open("chr_gene.txt",'w') as fp_out:
for chr,num in count_gene.items():
print("\t".join([chr,str(num)]) + "\n")
fp_out.write("\t".join([chr,str(num)]) + "\n")
def gene_len(list_gene):
"""
gene_len
:param list_gene:
:return:
"""
print("gene_len")
with open("gene_len.txt",'w') as fp_out:
for gene_id,info in list_gene.items():
len = info.end - info.start + 1
print("\t".join([gene_id,str(len)]) + "\n")
fp_out.write("\t".join([gene_id,str(len)]) + "\n")
def gene_transcript(list_transcript):
"""
gene_transcript
:param list_transcript:
:return:
"""
print("gene_transcript")
count_transcript = {}
for info in list_transcript.values():
gene_id = info.parent
if gene_id in count_transcript:
count_transcript[gene_id] += 1
else:
count_transcript[gene_id] = 1
with open("gene_transcript.txt",'w') as fp_out:
for gene_id,num in count_transcript.items():
print("\t".join([gene_id,str(num)]) + "\n")
fp_out.write("\t".join([gene_id,str(num)]) + "\n")
def transcript_exon(list_exon):
"""
transcript_exon
:param list_exon:
:return:
"""
print("transcript_exon")
count_exon = {}
for exon in list_exon:
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += 1
else:
count_exon[transcript_id] = 1
with open("transcript_exon.txt",'w') as fp_out:
for transcript_id,num in count_exon.items():
print("\t".join([transcript_id,str(num)]) + "\n")
fp_out.write("\t".join([transcript_id,str(num)]) + "\n")
def exon_pos(list_exon):
"""
exon_pos
:param list_exon:
:return:
"""
print("exon_pos")
count_exon = {}
for exon in list_exon():
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += ",%s-%s"%(str(exon.start),str(exon.end))
else:
count_exon[transcript_id] = "%s-%s"%(str(exon.start),str(exon.end))
with open("exon_pos.txt",'w') as fp_out:
for transcript_id,pos in count_exon.items():
print("\t".join([transcript_id,pos]) + "\n")
fp_out.write("\t".join([transcript_id,pos]) + "\n")
if __name__ == "__main__":
main(args)
3. 第四题和第三题均使用python完成,并且,根据教学视频和不同的实验材料,对代码进行适当的思考与调整,从而能更好的理解整个程序,同时也觉得自己python的能力得到了大大的提高,感谢东哥!!!
|
|