搜索
查看: 2987|回复: 4

芯片数据和转录组数据的差异分析代码大全

[复制链接]

634

主题

1182

帖子

4030

积分

管理员

Rank: 9Rank: 9Rank: 9

积分
4030
发表于 2017-9-27 20:49:05 | 显示全部楼层 |阅读模式
芯片数据的差异分析在R里面用到的包和转录组不太一样,所以分开给代码:首先是芯片数据的差异分析,如下
[AppleScript] 纯文本查看 复制代码
 

do_DEG_2groups  <- function(prefix='GSE1009',exprSet=example_exprSet ,group_list ,method,destdir='.'){
## 下面用#注释的代码是告诉你如何在R里面做测试数据
  # library(CLL)
  # data(sCLLex)
  # suppressMessages(library(limma))
  # exprSet = exprs(sCLLex)
  # pdata=pData(sCLLex)
  # group_list = pdata$Disease
  # do_DEG_2groups('CLL',exprSet,group_list)


  if (method=='limma'){
    library(limma)
    design=model.matrix(~factor(group_list))
    fit=lmFit(exprSet,design)
    fit=eBayes(fit)
    return(topTable(fit,coef=2,n=Inf))

  }else {
    dat=exprSet
    group1= which(group_list==levels(group_list)[1])
    group2= which( group_list==levels(group_list)[2])
    dat1=dat[,group1];dat2=dat[,group2]
    dat=cbind(dat1,dat2)
    if (method=='t.test'){
      library(pi0)
      pvals=matrix.t.test(dat,1,length(group1),length(group2))
    }else if (method=='samr'){
      library(samr)
      data=list(x=exprSet,y=as.numeric(as.factor(group_list)),
                geneid=as.character(1:nrow(exprSet)),
                genenames=rownames(exprSet),
                logged2=TRUE
      )
      samr.obj<-samr(data, resp.type="Two class unpaired", nperms=1000)
      pv=samr.pvalues.from.perms(samr.obj$tt, samr.obj$ttstar)
      #delta.table <- samr.compute.delta.table(samr.obj)
      del<- 0.3
      #samr.plot(samr.obj, del=.3)
      #siggenes.table<- samr.compute.siggenes.table(samr.obj, del, data, delta.table) ## a list contains up gene list and down gene list
      pvals=pv
    }else  if (method=='rowttest'){
      library(genefilter)
      library(impute)
      t.test <- rowttests(exprSet, fac= group_list)
      t.test[1:10, ]
      pvals= t.test$p.value
    }else  if (method=='resamplingPvalues'){

      library(BioNet)
      results <- resamplingPvalues(exprSet, group_list, alternative="two.sided")
      head(results$p.values,10)
      pvals=results$p.values
    }else {
      stop('we just affort limma,samr,rowttest in genefilter,resamplingPvalues in BioNet,t.test to do DEG analysis!!!')
    }

    p.adj=p.adjust(pvals,method = 'BH')
    avg_1=rowMeans(dat1);avg_2=rowMeans(dat2);
    FC=avg_2/avg_1;
    results=cbind(avg_1,avg_2,FC,pvals,p.adj)
    colnames(results)=c('avg_1','avg_2','logFC','P.Value','adj.P.Val') ## make sure return the same column name with limma
    rownames(results)=rownames(dat)
    results=as.data.frame(results)
    return(results[order(results$P.Value),])

  }
}



可以看到,上面的代码涉及到了limma,samr,BioNet,还有简单的t检验,都可以用来做差异分析,但是这个函数我写的时候只支持2个比较。

其实可以多比较,看下面的转录组数据的代码,就很容易理解了。

[AppleScript] 纯文本查看 复制代码
rm(list=ls())
library(reshape2)
library(edgeR)
library(DESeq2)

setwd("G:/mRNA/DEG")
a=read.table('hisat2_mm10_htseq.txt',stringsAsFactors = F)
######################################################################
#ESCTSA01.geneCounts	Nek1	2790
#ESCTSA01.geneCounts	Nek10	18
#ESCTSA01.geneCounts	Nek11	2
#ESCTSA01.geneCounts	Nek2	4945
######################################################################
colnames(a)=c('sample','gene','reads')
exprSet=dcast(a,gene~sample)
head(exprSet)

# write.table(exprSet[grep("^__",exprSet$gene),],'hisat2.stats.txt',quote=F,sep='\t')
# exprSet=exprSet[!grepl("^__",exprSet$gene),] 

geneLists=exprSet[,1]
exprSet=exprSet[,-1]
head(exprSet)

rownames(exprSet)=geneLists
colnames(exprSet)=do.call(rbind,strsplit(colnames(exprSet),'\\.'))[,1]

write.csv(exprSet,'raw_reads_matrix.csv') 

keepGene=rowSums(cpm(exprSet)>0) >=2
table(keepGene);dim(exprSet)
dim(exprSet[keepGene,])
exprSet=exprSet[keepGene,]
rownames(exprSet)=geneLists[keepGene]

str(exprSet)

group_list=c('control','control','treat_12','treat_12','treat_2','treat_2')

write.csv(exprSet,'filter_reads_matrix.csv' )
 


######################################################################
###################      Firstly for DEseq2      #####################
######################################################################
if(T){
  
  suppressMessages(library(DESeq2)) 
  (colData <- data.frame(row.names=colnames(exprSet), group_list=group_list) )
  dds <- DESeqDataSetFromMatrix(countData = exprSet,
                                colData = colData,
                                design = ~ group_list)
  dds <- DESeq(dds)
  png("qc_dispersions.png", 1000, 1000, pointsize=20)
  plotDispEsts(dds, main="Dispersion plot")
  dev.off()
  
  
  rld <- rlogTransformation(dds)
  exprMatrix_rlog=assay(rld) 
  write.csv(exprMatrix_rlog,'exprMatrix.rlog.csv' )
  
  normalizedCounts1 <- t( t(counts(dds)) / sizeFactors(dds) )
  # normalizedCounts2 <- counts(dds, normalized=T) # it's the same for the tpm value
  # we also can try cpm or rpkm from edgeR pacage
  exprMatrix_rpm=as.data.frame(normalizedCounts1) 
  head(exprMatrix_rpm)
  write.csv(exprMatrix_rpm,'exprMatrix.rpm.csv' )
  
  png("DEseq_RAWvsNORM.png",height = 800,width = 800)
  par(cex = 0.7)
  n.sample=ncol(exprSet)
  if(n.sample>40) par(cex = 0.5)
  cols <- rainbow(n.sample*1.2)
  par(mfrow=c(2,2))
  boxplot(exprSet, col = cols,main="expression value",las=2)
  boxplot(exprMatrix_rlog, col = cols,main="expression value",las=2)
  hist(as.matrix(exprSet))
  hist(exprMatrix_rlog)
  dev.off()
  
  library(RColorBrewer)
  (mycols <- brewer.pal(8, "Dark2")[1:length(unique(group_list))])
  cor(as.matrix(exprSet))
  # Sample distance heatmap
  sampleDists <- as.matrix(dist(t(exprMatrix_rlog)))
  #install.packages("gplots",repos = "http://cran.us.r-project.org")
  library(gplots)
  png("qc-heatmap-samples.png", w=1000, h=1000, pointsize=20)
  heatmap.2(as.matrix(sampleDists), key=F, trace="none",
            col=colorpanel(100, "black", "white"),
            ColSideColors=mycols[group_list], RowSideColors=mycols[group_list],
            margin=c(10, 10), main="Sample Distance Matrix")
  dev.off()
  
  cor(exprMatrix_rlog) 
  
  
  res <- results(dds, contrast=c("group_list","treat_2","control"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_treat_2=as.data.frame(resOrdered)
  write.csv(DEG_treat_2,"DEG_treat_2_deseq2.results.csv")
  
  res <- results(dds, contrast=c("group_list","treat_12","control"))
  resOrdered <- res[order(res$padj),]
  head(resOrdered)
  DEG_treat_12=as.data.frame(resOrdered)
  write.csv(DEG_treat_12,"DEG_treat_12_deseq2.results.csv")
  
  
  
}

######################################################################
###################      Then  for edgeR        #####################
######################################################################
if(T){
  
  library(edgeR)
  d <- DGEList(counts=exprSet,group=factor(group_list))
  d$samples$lib.size <- colSums(d$counts)
  d <- calcNormFactors(d)
  d$samples
  
  ## The calcNormFactors function normalizes for RNA composition by finding a set of scaling
  ## factors for the library sizes that minimize the log-fold changes between the samples for most
  ## genes. The default method for computing these scale factors uses a trimmed mean of Mvalues
  ## (TMM) between each pair of samples
  
  png('edgeR_MDS.png')
  plotMDS(d, method="bcv", col=as.numeric(d$samples$group))
  dev.off()
  
  # The glm approach to multiple groups is similar to the classic approach, but permits more general comparisons to be made
  
  dge=d
 
  design <- model.matrix(~0+factor(group_list))
  rownames(design)<-colnames(dge)
  colnames(design)<-levels(factor(group_list))
  
  dge <- estimateGLMCommonDisp(dge,design)
  dge <- estimateGLMTrendedDisp(dge, design)
  dge <- estimateGLMTagwiseDisp(dge, design)
  
  fit <- glmFit(dge, design)
  
  lrt <- glmLRT(fit,  contrast=c(-1,1,0))
  nrDEG=topTags(lrt, n=nrow(exprSet))
  nrDEG=as.data.frame(nrDEG)
  head(nrDEG)
  write.csv(nrDEG,"DEG_treat_12_edgeR.csv",quote = F)
  
  lrt <- glmLRT(fit, contrast=c(-1,0,1) )
  nrDEG=topTags(lrt, n=nrow(exprSet))
  nrDEG=as.data.frame(nrDEG)
  head(nrDEG)
  write.csv(nrDEG,"DEG_treat_2_edgeR.csv",quote = F)
  summary(decideTests(lrt))
  plotMD(lrt)
  abline(h=c(-1, 1), col="blue")
}

######################################################################
###################      Then  for limma/voom        #################
######################################################################


suppressMessages(library(limma))
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(exprSet)

dge <- DGEList(counts=exprSet)
dge <- calcNormFactors(dge)
logCPM <- cpm(dge, log=TRUE, prior.count=3)

v <- voom(dge,design,plot=TRUE, normalize="quantile")
fit <- lmFit(v, design)

group_list
cont.matrix=makeContrasts(contrasts=c('treat_12-control','treat_2-control'),levels = design)
fit2=contrasts.fit(fit,cont.matrix)
fit2=eBayes(fit2)
 
tempOutput = topTable(fit2, coef='treat_12-control', n=Inf)
DEG_treat_12_limma_voom = na.omit(tempOutput)
write.csv(DEG_treat_12_limma_voom,"DEG_treat_12_limma_voom.csv",quote = F)

tempOutput = topTable(fit2, coef='treat_2-control', n=Inf)
DEG_treat_2_limma_voom = na.omit(tempOutput)
write.csv(DEG_treat_2_limma_voom,"DEG_treat_2_limma_voom.csv",quote = F)



png("limma_voom_RAWvsNORM.png",height = 600,width = 600)
exprSet_new=v$E
par(cex = 0.7)
n.sample=ncol(exprSet)
if(n.sample>40) par(cex = 0.5)
cols <- rainbow(n.sample*1.2)
par(mfrow=c(2,2))
boxplot(exprSet, col = cols,main="expression value",las=2)
boxplot(exprSet_new, col = cols,main="expression value",las=2)
hist(as.matrix(exprSet))
hist(exprSet_new)
dev.off()










上一篇:ChIP的input也能call出peak来怎么办?
下一篇:比对工具代码大全
你这个问题很复杂,需要打赏,请点击 http://www.bio-info-trainee.com/donate 进行打赏,谢谢
回复

使用道具 举报

2

主题

41

帖子

387

积分

中级会员

Rank: 3Rank: 3

积分
387
发表于 2017-9-27 23:51:44 | 显示全部楼层
谢谢分享!还没来得及细看
回复 支持 反对

使用道具 举报

4

主题

20

帖子

395

积分

中级会员

Rank: 3Rank: 3

积分
395
发表于 2017-9-28 17:53:26 | 显示全部楼层
这绝对是好东西,收藏了,谢谢分享!
回复 支持 反对

使用道具 举报

0

主题

5

帖子

77

积分

注册会员

Rank: 2

积分
77
QQ
发表于 2017-11-27 19:36:57 | 显示全部楼层
大神,你好。不好意思打扰你了。我看了你写的那篇芯片数据和转录组数据的差异分析代码大全的帖子,非常受用,非常感谢。但是有一个小问题想请教一下大神,我在运行到fit=lmFit(exprSet,design)这个步骤时,显示error,原因是我的行数和列数不匹配,row dimension of design doesn't match column dimension of data object.> dim(exprSet)
[1] 20502    10
> dim(design)
[1] 4 4
> 我找了很久也不知道如何解决这个问题,非常着急。请大神帮忙指导一下,万分感谢
回复 支持 反对

使用道具 举报

0

主题

5

帖子

77

积分

注册会员

Rank: 2

积分
77
QQ
发表于 2017-11-27 19:45:20 | 显示全部楼层
wangtian1797 发表于 2017-11-27 19:36
大神,你好。不好意思打扰你了。我看了你写的那篇芯片数据和转录组数据的差异分析代码大全的帖子,非常受用 ...

实在不好意思,回复错了,应该是“表达芯片数据分析实战四:配对样本差异分析”那篇帖子
回复 支持 反对

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|手机版|小黑屋|生信技能树 ( 粤ICP备15016384号  

GMT+8, 2019-9-21 14:12 , Processed in 0.048736 second(s), 26 queries .

Powered by Discuz! X3.2

© 2001-2013 Comsenz Inc.