搜索
楼主: Jimmy

生信编程直播第0题-生信编程很简单!

[复制链接]

3

主题

8

帖子

193

积分

注册会员

Rank: 2

积分
193
发表于 2017-12-19 19:26:19 | 显示全部楼层
稍作改动,使其可以读gz格式的文件,将其变成软件。
[Python] 纯文本查看 复制代码
# fastq_analysis_tool
#! /bin/python/
# encoding = utf-8
"""
#A Tool Deal With FASTQ:
#	1.cut off 5',3' bases(input the num)
#	2.the length distribution of reads
#	3.FASTQ to FASTA
#	4.the number of bases and count the ratio of GC
Author : Xiaoting Zhang
TIME:2017/12/3
"""
from optparse import OptionParser
import sys
import filetype
import gzip

args = sys.argv

def cut_fastq(filename,cut_5=0,cut_3=0):
	"""
	cut off 5' and 3' bases
	:param:filename  -- the file which will be dealed with
	:param:cut_5  -- 5' bases number which will be cutted off
	:param:cut_3  -- 3' bases number which will be cutted off

	"""
	kind=filetype.guess(filename)
	if kind is None:
		with open(filename) as f:
			while f:
				line_1 = f.readline().strip("\n")
				if not (line_1 and f):
					break 
				line_2 = f.readline().strip("\n")
				line_3 = f.readline().strip("\n")
				line_4 = f.readline().strip("\n")
				line2 = line_2[cut_5:-cut_3]
				line4 = line_4[cut_5:-cut_3]
				print(line_1,"\n",line2,"\n",line_3,"\n",line4)
	elif kind.extension == "gz":
		with gzip.open(filename,'rb') as gf:
			while gf:
				line_1 = gf.readline().decode("utf8").strip("\n")
				if not ( line_1 and gf):
					break
				line_2 = gf.readline().decode("utf8").strip("\n")
				line_3 = gf.readline().decode("utf8").strip("\n")
				line_4 = gf.readline().decode("utf8").strip("\n")
				line2 = line_2[cut_5:-cut_3]
				line4 = line_4[cut_5:-cut_3]
				print(line_1,"\n",line2,"\n",line_3,"\n",line4)

def reads_dis(filename):
	"""
	statistics the reads length distribution
	parma:filename -- a fastq format file which will be statistics
	
	"""
	read_dis = {}
	kind=filetype.guess(filename)
	if kind is None:
		with open(filename) as f:
			while f:
				line_1 = f.readline().strip("\n")
				if not (line_1 and f):
					break 
				line_2 = f.readline().strip("\n")
				line_3 = f.readline().strip("\n")
				line_4 = f.readline().strip("\n")
				length = len(line_2)
				if not length in read_dis:
					read_dis[length] = 0
				read_dis[length] += 1
	elif kind.extension == "gz":
		with gzip.open(filename) as gf:
			while gf:
				line_1 = gf.readline().decode("utf8").strip("\n")
				if not (line_1 and gf):
					break
				line_2 = gf.readline().decode("utf8").strip("\n")
				line_3 = gf.readline().decode("utf8").strip("\n")
				line_4 = gf.readline().decode("utf8").strip("\n")
				length=len(line_2)
				if not length in read_dis:
					read_dis[length] = 0
				read_dis[length] += 1
	for length,num in read_dis.items():
		print("length : %s \t Number : %s" % (length,num))
def fastq2fasta(filename):
	"""
	Convert fastq to fasta
	parma:filename  -- input the FASTQ format file
	the line_1 is ID and the line_2 is bases 
	"""
	kind=filetype.guess(filename)
	if kind is None:
		with open(filename) as f:
			while f:
				line_1 = f.readline().strip("\n")
				if not (line_1 and f):
					break 
				line_2 = f.readline().strip("\n")
				line_3 = f.readline().strip("\n")
				line_4 = f.readline().strip("\n")
				print(">",line_1)
				print(line_2)
	elif kind.extension == "gz":
		with gzip.open(filename) as gf:
			while gf:
				line_1 = gf.readline().decode("utf8").strip("\n")
				if not (line_1 and gf):
					break
				line_2 = gf.readline().decode("utf8").strip("\n")
				line_3 = gf.readline().decode("utf8").strip("\n")
				line_4 = gf.readline().decode("utf8").strip("\n")
				print(">",line_1)
				print(line_2)
def length_of_fq(filename):
	"""
	Sum the bases to count the length
	parma:the fastq filename
	
	"""
	length=0
	kind=filetype.guess(filename)
	if kind is None:
		with open(filename) as f:
			while f:
				line_1 = f.readline().strip("\n")
				if not (line_1 and f):
					break 
				line_2 = f.readline().strip("\n")
				line_3 = f.readline().strip("\n")
				line_4 = f.readline().strip("\n")
				length += len(line_2)
	elif kind.extension == "gz":
		with gzip.open(filename,'rb') as gf:
			while gf:
				line_1 = gf.readline().decode('utf8').strip("\n")
				#line_1 = [x.decode('utf8').strip() for x in gf.readlines()]
				if not (line_1 and gf):
					break
				line_2 = gf.readline().decode("utf8").strip("\n")
				line_3 = gf.readline().decode("utf8").strip("\n")
				line_4 = gf.readline().decode("utf8").strip("\n")
				length += len(line_2)
	print(length)
	return length
def gc_ratio(filename):
	"""
	count the GC ratio 
	param:filename  -- the input FASTQ format file
	
	"""
	count_gc = 0
	kind=filetype.guess(filename)
	if kind is None:
		with open(filename) as f:
			while f:
				line_1 = f.readline().strip("\n")
				if not (line_1 and f):
					break 
				line_2 = f.readline().strip("\n")
				line_3 = f.readline().strip("\n")
				line_4 = f.readline().strip("\n")
				count_gc += line_2.count("G")+line_2.count("C")
	elif kind.extension == "gz":
		with gzip.open(filename,'rb') as gf:
			while gf:
				line_1 = gf.readline().decode('utf8').strip("\n")
				if not (line_1 and gf):
					break
				line_2 = gf.readline().decode("utf8").strip("\n")
				line_3 = gf.readline().decode("utf8").strip("\n")
				line_4 = gf.readline().decode("utf8").strip("\n")
				count_gc += line_2.count("G")+line_2.count("C")
	ratio = 0
	total = length_of_fq(filename)
	ratio =( count_gc * 1.0 )/total
	print("GC : %s" % (ratio))

def main(args):
	parser = OptionParser()
	parser.add_option("-f","--fastq",dest="filename",help=".fastq format file or .fastq.gz format file",metavar="file")
	parser.add_option("--cut","--cut-fastq",dest="cut",help="cut fastq file",action="store_true",default=False)
	parser.add_option("-5","--cut-5",dest="cut_5",type=int,help="cut fastq N(N>0) bases from 5'",metavar="INT",default=0)
	parser.add_option("-3","--cut-3",dest="cut_3",type=int,help="cut fastq N(N>0) bases from 3'",metavar="INT",default=0)
	parser.add_option("--len-dis",dest="len_dis",help="sequence length distribution",action="store_true",default=False)
	parser.add_option("--fasta","--fastq2fasta",dest="fasta",help="convert fastq to fasta",action="store_true",default=False)
	parser.add_option("--gc","--count-gc",dest="gc",help="count the GC%",action="store_true",default=False)
	parser.add_option("--len","--fastq-length",dest="fq_len",help="total reads number",action="store_true",default=False)
	(options,args) = parser.parse_args()
	if not options.filename:
		parser.print_help()
	filename=options.filename
	if options.cut:
		cut_5 = options.cut_5
		cut_3 = options.cut_3
		cut_fastq(filename,cut_5,cut_3)
	if options.len_dis:
		reads_dis(filename)
	if options.fq_len:
		length_of_fq(filename)
	if options.fasta:
		fastq2fasta(filename)
	if options.gc:
		gc_ratio(filename)
if __name__ == '__main__':
	main(args)
回复 支持 1 反对 0

使用道具 举报

0

主题

12

帖子

165

积分

注册会员

Rank: 2

积分
165
发表于 2018-5-16 09:38:02 | 显示全部楼层
dongye 发表于 2017-3-18 19:04
生信编程直播第零题第一讲,Fastq文件简单统计处理
by dongye

请问一下 在统计fastq文件reads长度分布时 为什么
if not length in reads_len:
       reads_len[length] = 0
当这个长度不存在字典里的时候不是应该记为1吗?为什么是0呢?
      
回复 支持 反对

使用道具 举报

0

主题

5

帖子

87

积分

注册会员

Rank: 2

积分
87
发表于 2018-7-11 10:31:19 | 显示全部楼层
基础的视频在哪呢?
回复 支持 反对

使用道具 举报

11

主题

52

帖子

279

积分

中级会员

Rank: 3Rank: 3

积分
279
发表于 2018-7-18 16:01:37 | 显示全部楼层
居然TT 发表于 2018-7-11 10:31
基础的视频在哪呢?

我也在找。。。。
回复 支持 反对

使用道具 举报

11

主题

52

帖子

279

积分

中级会员

Rank: 3Rank: 3

积分
279
发表于 2018-7-18 16:03:24 | 显示全部楼层
Dorom 发表于 2018-5-16 09:38
请问一下 在统计fastq文件reads长度分布时 为什么
if not length in reads_len:
       reads_len[leng ...

在下方的语句里是+1操作,判断是否存在是防止加一个未初始的值导致错误,复制0下方+1就是1了
回复 支持 反对

使用道具 举报

0

主题

14

帖子

80

积分

注册会员

Rank: 2

积分
80
发表于 2018-7-27 17:08:38 | 显示全部楼层
## 5,3段截掉几个碱基

# !/usr/bin/python
# -*- coding: utf-8 -*-
##########################################
# Author    :    luhope
# Email      :    luhuilonghope@outlook.com
# Comment   :    using to remove the bases at both ends of the sequence (for fastq)
# Parameter :    python split_5_3_cut_side.py raw_data.fq 0 0 new_reads_1.fq ( = copy)
##########################################
import sys
args = sys.argv
f1 = open(args[4],'w')
with open(args[1],'r') as f:
    for (num,value) in enumerate(f):
        if num % 4 == 1  and int(args[3]) > 0:
            f1.write(value.strip()[int(args[2]):][:-int(args[3])]+'\n')
        elif num % 4 == 1 and int(args[3]) == 0:
            f1.write(value.strip()[int(args[2]):]+'\n')
        else:
            f1.write(value.strip()+'\n')
f1.close()
回复 支持 反对

使用道具 举报

0

主题

14

帖子

80

积分

注册会员

Rank: 2

积分
80
发表于 2018-7-27 17:15:13 | 显示全部楼层
dongye 发表于 2018-7-18 16:01
我也在找。。。。

视频收费啊,在腾讯云视频上https://ke.qq.com/course/285055
回复 支持 反对

使用道具 举报

0

主题

14

帖子

80

积分

注册会员

Rank: 2

积分
80
发表于 2018-7-27 17:16:48 | 显示全部楼层
回复 支持 反对

使用道具 举报

0

主题

14

帖子

80

积分

注册会员

Rank: 2

积分
80
发表于 2018-7-27 20:00:36 | 显示全部楼层
## 序列长度分布统计
# !/user/bin/python
# -*-coding: utf-8 -*-
##########################################
# Author    :    luhope
# Email       :    luhuilonghope@outlook.com
# Comment   :    The distribution of bases in the sequence (for fastq)
# Parameter  :    python seq_len_distrb.py new_data.fq read_distrb.xls
##########################################
import sys
args = sys.argv

lens = []
f1 = open(args[2],'w')
with open(args[1],'r') as f:
    for (num,value) in enumerate(f):
        if num % 4 == 1:
            lens.append(len(value))

f1.write('read_lenth'+'\t'+'number' + '\n')
for x in sorted(set(lens)):
    f1.write( '%d\t' % x +'%d\n' % lens.count(x))

f1.close()
回复 支持 反对

使用道具 举报

0

主题

14

帖子

80

积分

注册会员

Rank: 2

积分
80
发表于 2018-7-27 22:55:13 | 显示全部楼层
## FASTQ 转换成 FASTA

# !/usr/bin/python
# -*- coding: utf-8 -*-
##########################################
# Author    :    luhope
# Email     :    luhuilonghope@outlook.com
# Comment   :    using to convert fastq to fasta
# Parameter :    python fastaq2fasta.py raw_data.fq new_data.fa
##########################################
import sys
args = sys.argv

f1 = open(args[2],'w')
with open(args[1],'r') as f:
    for (num,value) in enumerate(f):
        if num % 4 == 0 or num % 4 == 1:
            f1.write(value.replace('@','>'))

f1.close()
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|生信技能树 ( 粤ICP备15016384号  

GMT+8, 2019-9-21 01:15 , Processed in 0.042506 second(s), 21 queries .

Powered by Discuz! X3.2

© 2001-2013 Comsenz Inc.