|
发表于 2017-2-9 21:30:30
|
显示全部楼层
本帖最后由 zckoo007 于 2017-2-9 21:45 编辑
002+Python+R
1. 下载pycharm 摸索了三天,了解一些基本功能,project创建(pycharm 相当强大!)
2. 下载Git bash 一天,应为不会Linux,以后慢慢摸索
3. python 函数的定义 两天,
4. python class 一天
5. 读懂代码 两天
6. 感谢 dongye 哥!
[Python] 纯文本查看 复制代码 import sys
import re
args = sys.argv
'''sys.argv 是命令行参数,是一个字符串列表,0代表其路径'''
class Genome_info: #这是一个基类,所有类型都通用的
def __init__(self):
self.chr = "" #染色体号
self.start = 0
self.end = 0
class Gene(Genome_info): #这个函数继承了基类
def __init__(self):
Genome_info.__init__(self)
self.orientation = ""
self.id = ""
class Transcript(Genome_info):
def __init__(self):
Genome_info.__init__(self)
self.id = ""
self.parent = ""
class Exon(Genome_info):
def __init__(self):
Genome_info.__init__(self)
self.parent = ""
def main(args):
"""
一个输入参数:
第一个参数为物种gtf文件
:return:
"""
list_chr = []
list_gene = {} #因为有 id 号 所以用dir
list_transcript = {}
list_exon = []
# l_n = 0
with open(args[1]) as fp_gtf: #打开文件遍历GTF
for line in fp_gtf:
if line.startswith("#"): #号开头的注释过滤掉,不统计这个
continue
# print ("in %s" % l_n)
# l_n += 1
lines = line.strip("\n").split("\t")
chr = lines[0]
type = lines[2]
start = int(lines[3])
end = int(lines[4])
orientation = lines[6]
attr = lines[8]
if not re.search(r'protein_coding', attr): #取到的是蛋白的编码,如果没有的话就跳过,不做统计了
continue
if not chr in list_chr: #把染色体的类型添加到列表里
list_chr.append(chr)
if type == "gene":
gene = Gene() #初始化一个基因对象
id = re.search(r'gene_id "([^;]+)";?', attr).group(1) # 0 返回所有列表,1取第一个
gene.chr = chr
gene.start = start
gene.end = end
gene.id = id
gene.orientation = orientation
list_gene[id] = gene
# print(id)
elif type == "transcript":
transcript = Transcript()
id = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
parent = re.search(r'gene_id "([^;]+)";?', attr).group(1)
if not parent in list_gene:
continue
transcript.chr = chr
transcript.start = start
transcript.end = end
transcript.id = id
transcript.parent = parent
list_transcript[id] = transcript
elif type == "exon":
exon = Exon()
parent = re.search(r'transcript_id "([^;]+)";?', attr).group(1)
if not parent in list_transcript:
continue
exon.chr = chr
exon.start = start
exon.end = end
exon.parent = parent
list_exon.append(exon)
chr_gene(list_gene)
gene_len(list_gene)
gene_transcript(list_transcript)
transcript_exon(list_exon)
exon_pos(list_exon)
def chr_gene(list_gene):
"""
染色体上基因数量分布
:param list_gene:
:return:
"""
print("染色体上基因数量分布")
count_gene = {} #这是一个计数器
for info in list_gene.values():
chr = info.chr
if chr in count_gene:
count_gene[info.chr] += 1
else:
count_gene[info.chr] = 1
with open("chr_gene.txt", 'w') as fp_out:
for chr, num in count_gene.items():
print("\t".join([chr, str(num)]) + "\n")
fp_out.write("\t".join([chr, str(num)]) + "\n")
def gene_len(list_gene):
"""
基因长度分布情况
:param list_gene:
:return:
"""
print("基因长度分布情况")
with open("gene_len.txt", 'w') as fp_out:
for gene_id, info in list_gene.items():
len = info.end - info.start + 1
fp_out.write("\t".join([gene_id, str(len)]) + "\n")
print("\t".join([gene_id, str(len)]) + "\n")
def gene_transcript(list_transcript):
"""
基因的转录本数量分布
:param list_transcript:
:return:
"""
print("基因的转录本数量分布")
count_transcript = {}
for info in list_transcript.values():
gene_id = info.parent
if gene_id in count_transcript:
count_transcript[gene_id] += 1
else:
count_transcript[gene_id] = 1
with open("gene_transcript.txt", 'w') as fp_out:
for gene_id, num in count_transcript.items():
print("\t".join([gene_id, str(num)]) + "\n")
fp_out.write("\t".join([gene_id, str(num)]) + "\n")
def transcript_exon(list_exon):
"""
转录本的外显子数量统计
:param list_exon:
:return:
"""
print("转录本的外显子数量统计")
count_exon = {}
for exon in list_exon:
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += 1
else:
count_exon[transcript_id] = 1
with open("transcript_exon.txt", 'w') as fp_out:
for transcript_id, num in count_exon.items():
print("\t".join([transcript_id, str(num)]) + "\n")
fp_out.write("\t".join([transcript_id, str(num)]) + "\n")
def exon_pos(list_exon):
"""
外显子坐标统计
:param list_exon:
:return:
"""
print("外显子坐标统计")
count_exon = {}
for exon in list_exon:
transcript_id = exon.parent
if transcript_id in count_exon:
count_exon[transcript_id] += ",%s-%s" % (str(exon.start), str(exon.end))
else:
count_exon[transcript_id] = "%s-%s" % (str(exon.start), str(exon.end))
with open("exon_pos.txt", 'w') as fp_out:
for transcript_id, pos in count_exon.items():
print("\t".join([transcript_id, pos]) + "\n")
fp_out.write("\t".join([transcript_id, pos]) + "\n")
def gene_exon_pos(list_gene, list_transcript, list_exon):
"""
根据exon的parent将所有exon对应到transcript
根据transcript的parent将所有transcript对应到gene
根据gene按chr分组得到chromosome列表
从chromosome中输出某个指定基因的所有外显子坐标信息并画图
生信编程直播第五题
:param list_gene:
:param list_transcript:
:param list_exon:
:return:
"""
pass
if __name__ == "__main__":
main(args)
染色体上基因数量分布
1 5
基因长度分布情况
ENSG00000186092 918
ENSG00000279928 1766
ENSG00000279457 15400
ENSG00000278566 939
ENSG00000273547 939
基因的转录本数量分布
ENSG00000186092 1
ENSG00000279928 1
ENSG00000279457 3
ENSG00000278566 1
ENSG00000273547 1
转录本的外显子数量统计
ENST00000335137 1
ENST00000624431 3
ENST00000623834 10
ENST00000623083 11
ENST00000624735 15
ENST00000426406 1
ENST00000332831 1
外显子坐标统计
ENST00000335137 69091-70008
ENST00000624431 182393-182746,183114-183240,183922-184158
ENST00000623834 195259-195411,188791-188892,188489-188584,188439-188486,188130-188266,188022-188028,187755-187886,187376-187577,185491-185559,184923-185350
ENST00000623083 195263-195411,188791-188902,188489-188584,188439-188486,188126-188266,187755-187886,187376-187577,187129-187287,186317-186469,185491-185559,184925-185350
ENST00000624735 200050-200322,195259-195416,188791-188889,188489-188584,188439-188486,188130-188266,188102-188105,187755-187886,187380-187577,187270-187287,187129-187267,186317-186469,185529-185559,184977-185049,184927-184971
ENST00000426406 450740-451678
ENST00000332831 685716-686654
Process finished with exit code 0
|
|