1. 程式人生 > >利用R語言對RNA-Seq進行探索分析與差異表達分析

利用R語言對RNA-Seq進行探索分析與差異表達分析

介紹

本文參考 bioconductor 中RNA-Seq workflow: gene-level exploratory analysis and differential expression並對其根據需要進行了增減。

試驗資料

資料來源

Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri R Jr, Tantisira KG, Weiss ST, Lu Q. “RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.” PLoS One. 2014 Jun 13;9(6):e99625. PMID: 24926665. GEO: GSE52778.

在這個RNA-Seq試驗中,採用了4種呼吸道平滑肌肉細胞(airway smooth muscle cells),每種細胞均有 地塞米松治療、非治療兩類。共計8個樣本,儲存在 airway 包中。

原始資料的處理

高通量測序資料常採用 FASTQ 格式來保 存所測的鹼基讀段和質量分數。如圖 所示,FASTQ 格式以測序讀段為單位存 儲,每條讀段佔 4 行,其中第一行和的第三行由檔案識別標誌和讀段名(ID)組成(第一行以“@”開頭而第三行以“+”開頭;第三行中 ID 可以省略,但“+”不能省 略),第二行為鹼基序列,第四行為各鹼基所對應的測序質量分數序列。

這裡寫圖片描述

採用 tophat/bowtie2 將原始資料fastq對映到基因組序列,得到bam檔案;
此處我們採用airway 自帶的bam檔案。

載入 airway 包, 並利用自帶bam檔案

library("airway")
dir <- system.file("extdata", package="airway", mustWork=TRUE)
list.files(dir) # 檔案列表
##  [1] "GSE52778_series_matrix.txt"        "Homo_sapiens.GRCh37.75_subset.gtf"
##  [3] "sample_table.csv"                  "SraRunInfo_SRP033351.csv"         
##  
[5] "SRR1039508_subset.bam" "SRR1039508_subset.bam.bai" ## [7] "SRR1039509_subset.bam" "SRR1039512_subset.bam" ## [9] "SRR1039513_subset.bam" "SRR1039516_subset.bam" ## [11] "SRR1039517_subset.bam" "SRR1039520_subset.bam" ## [13] "SRR1039521_subset.bam"
csvfile <- file.path(dir,"sample_table.csv")
(sampleTable <- read.csv(csvfile,row.names=1)) # 獲取樣本資訊
##            SampleName    cell   dex albut        Run avgLength Experiment    Sample    BioSample
## SRR1039508 GSM1275862  N61311 untrt untrt SRR1039508       126  SRX384345 SRS508568 SAMN02422669
## SRR1039509 GSM1275863  N61311   trt untrt SRR1039509       126  SRX384346 SRS508567 SAMN02422675
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512       126  SRX384349 SRS508571 SAMN02422678
## SRR1039513 GSM1275867 N052611   trt untrt SRR1039513        87  SRX384350 SRS508572 SAMN02422670
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516       120  SRX384353 SRS508575 SAMN02422682
## SRR1039517 GSM1275871 N080611   trt untrt SRR1039517       126  SRX384354 SRS508576 SAMN02422673
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520       101  SRX384357 SRS508579 SAMN02422683
## SRR1039521 GSM1275875 N061011   trt untrt SRR1039521        98  SRX384358 SRS508580 SAMN02422677
filenames <- file.path(dir, paste0(sampleTable$Run, "_subset.bam")) # 提取bam檔案

獲取bam資料

library("Rsamtools")
filenames  
## [1] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039508_subset.bam"
## [2] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039509_subset.bam"
## [3] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039512_subset.bam"
## [4] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039513_subset.bam"
## [5] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039516_subset.bam"
## [6] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039517_subset.bam"
## [7] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039520_subset.bam"
## [8] "/Library/Frameworks/R.framework/Versions/library/airway/extdata/SRR1039521_subset.bam"
bamfiles <- BamFileList(filenames, yieldSize=2000000) #  將bam檔案放入列表,yieldSize 表示每次被讀取的記錄數
seqinfo(bamfiles) # 序列的基本資訊
## Seqinfo object with 84 sequences from an unspecified genome:
##   seqnames   seqlengths isCircular genome
##   1           249250621       <NA>   <NA>
##   10          135534747       <NA>   <NA>
##   11          135006516       <NA>   <NA>
##   12          133851895       <NA>   <NA>
##   13          115169878       <NA>   <NA>
##   ...               ...        ...    ...
##   GL000210.1      27682       <NA>   <NA>
##   GL000231.1      27386       <NA>   <NA>
##   GL000229.1      19913       <NA>   <NA>
##   GL000226.1      15008       <NA>   <NA>
##   GL000207.1       4262       <NA>   <NA>
seqlevels(bamfiles) # 所有的染色體名稱, GLxxxxxx.x表示genomic contig
##  [1] "1"          "10"         "11"         "12"         "13"         "14"         "15"        
##  [8] "16"         "17"         "18"         "19"         "2"          "20"         "21"        
## [15] "22"         "3"          "4"          "5"          "6"          "7"          "8"         
## [22] "9"          "MT"         "X"          "Y"          "GL000192.1" "GL000225.1" "GL000194.1"
## [29] "GL000193.1" "GL000200.1" "GL000222.1" "GL000212.1" "GL000195.1" "GL000223.1" "GL000224.1"
## [36] "GL000219.1" "GL000205.1" "GL000215.1" "GL000216.1" "GL000217.1" "GL000199.1" "GL000211.1"
## [43] "GL000213.1" "GL000220.1" "GL000218.1" "GL000209.1" "GL000221.1" "GL000214.1" "GL000228.1"
## [50] "GL000227.1" "GL000191.1" "GL000208.1" "GL000198.1" "GL000204.1" "GL000233.1" "GL000237.1"
## [57] "GL000230.1" "GL000242.1" "GL000243.1" "GL000241.1" "GL000236.1" "GL000240.1" "GL000206.1"
## [64] "GL000232.1" "GL000234.1" "GL000202.1" "GL000238.1" "GL000244.1" "GL000248.1" "GL000196.1"
## [71] "GL000249.1" "GL000246.1" "GL000203.1" "GL000197.1" "GL000245.1" "GL000247.1" "GL000201.1"
## [78] "GL000235.1" "GL000239.1" "GL000210.1" "GL000231.1" "GL000229.1" "GL000226.1" "GL000207.1"

匯入基因組特徵(註釋)

eg. 外顯子的染色體位置, 基因的起始、終止位點

library("GenomicFeatures")
gtffile <- file.path(dir,"Homo_sapiens.GRCh37.75_subset.gtf")
(txdb <- makeTxDbFromGFF(gtffile, format="gtf"))
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: /Library/Frameworks/R.framework/Versions/library/airway/extdata/Homo_sapiens.GRCh37.75_subset.gtf
## # Organism: NA
## # miRBase build ID: NA
## # Genome: NA
## # transcript_nrow: 65
## # exon_nrow: 279
## # cds_nrow: 158
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-06-17 17:40:06 +0800 (Wed, 17 Jun 2015)
## # GenomicFeatures version at creation time: 1.20.1
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
(genes <- exonsBy(txdb, by="gene"))
## GRangesList object of length 20:
## $ENSG00000009724 
## GRanges object with 18 ranges and 2 metadata columns:
##        seqnames               ranges strand   |   exon_id       exon_name
##           <Rle>            <IRanges>  <Rle>   | <integer>     <character>
##    [1]        1 [11086580, 11087705]      -   |        98 ENSE00000818830
##    [2]        1 [11090233, 11090307]      -   |        99 ENSE00000472123
##    [3]        1 [11090805, 11090939]      -   |       100 ENSE00000743084
##    [4]        1 [11094885, 11094963]      -   |       101 ENSE00000743085
##    [5]        1 [11097750, 11097868]      -   |       103 ENSE00003520086
##    ...      ...                  ...    ... ...       ...             ...
##   [14]        1 [11106948, 11107176]      -   |       111 ENSE00003467404
##   [15]        1 [11106948, 11107176]      -   |       112 ENSE00003489217
##   [16]        1 [11107260, 11107280]      -   |       113 ENSE00001833377
##   [17]        1 [11107260, 11107284]      -   |       114 ENSE00001472289
##   [18]        1 [11107260, 11107290]      -   |       115 ENSE00001881401
## 
## ...
## <19 more elements>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
seqlevels(genes) # 染色體的名字
## [1] "1"

染色體的名稱 seqlevels(bamfiles)seqlevels(genes) 應該保持一致, 特別留意是否含有 “chr”, 要麼都有”chr”, 要麼都沒有。

基因計數

library("GenomicAlignments")
se <- summarizeOverlaps(features = genes, reads = bamfiles,
                        mode = "Union",  # 讀段覆蓋的模式
                        singleEnd=FALSE, #雙末端 not 單末端
                        ignore.strand=TRUE,# True 表示忽略±鏈的限制
                        fragments=TRUE  ) # 只應用於雙末端測序,true表示非成對的對端應該被計數
class(se) # 得到 SummarizedExperiment 資料,可用於後續計算
## [1] "SummarizedExperiment"
## attr(,"package")
## [1] "GenomicRanges"

這裡寫圖片描述

上圖顯示的是SummarizedExperiment類(以及他的子類DESeqDataSet)的佈局, 粉紅色 assay(se) 表示實際的資料, 每行為一個基因,每列為一個樣本;
colData 表示樣本的具體資訊,隨後我們會對它進行填充;rowRanges 表示每一個基因的資訊。具體如下

se
## class: SummarizedExperiment 
## dim: 20 8 
## exptData(0):
## assays(1): counts
## rownames(20): ENSG00000009724 ENSG00000116649 ... ENSG00000271794 ENSG00000271895
## rowRanges metadata column names(0):
## colnames(8): SRR1039508_subset.bam SRR1039509_subset.bam ... SRR1039520_subset.bam
##   SRR1039521_subset.bam
## colData names(0):
head(assay(se))
##                 SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam
## ENSG00000009724                    38                    28                    66
## ENSG00000116649                  1004                  1255                  1122
## ENSG00000120942                   218                   256                   233
## ENSG00000120948                  2751                  2080                  3353
## ENSG00000171819                     4                    50                    19
## ENSG00000171824                   869                  1075                  1115
##                 SRR1039513_subset.bam SRR1039516_subset.bam SRR1039517_subset.bam
## ENSG00000009724                    24                    42                    41
## ENSG00000116649                  1313                  1100                  1879
## ENSG00000120942                   252                   269                   465
## ENSG00000120948                  1614                  3519                  3716
## ENSG00000171819                   543                     1                    10
## ENSG00000171824                  1051                   944                  1405
##                 SRR1039520_subset.bam SRR1039521_subset.bam
## ENSG00000009724                    47                    36
## ENSG00000116649                   745                  1536
## ENSG00000120942                   207                   400
## ENSG00000120948                  2220                  1990
## ENSG00000171819                    14                  1067
## ENSG00000171824                   748                  1590
colSums(assay(se))
## SRR1039508_subset.bam SRR1039509_subset.bam SRR1039512_subset.bam SRR1039513_subset.bam 
##                  6478                  6501                  7699                  6801 
## SRR1039516_subset.bam SRR1039517_subset.bam SRR1039520_subset.bam SRR1039521_subset.bam 
##                  8009                 10849                  5254                  9168
colData(se)
## DataFrame with 8 rows and 0 columns
rowRanges(se)
## GRangesList object of length 20:
## $ENSG00000009724 
## GRanges object with 18 ranges and 2 metadata columns:
##        seqnames               ranges strand   |   exon_id       exon_name
##           <Rle>            <IRanges>  <Rle>   | <integer>     <character>
##    [1]        1 [11086580, 11087705]      -   |        98 ENSE00000818830
##    [2]        1 [11090233, 11090307]      -   |        99 ENSE00000472123
##    [3]        1 [11090805, 11090939]      -   |       100 ENSE00000743084
##    [4]        1 [11094885, 11094963]      -   |       101 ENSE00000743085
##    [5]        1 [11097750, 11097868]      -   |       103 ENSE00003520086
##    ...      ...                  ...    ... ...       ...             ...
##   [14]        1 [11106948, 11107176]      -   |       111 ENSE00003467404
##   [15]        1 [11106948, 11107176]      -   |       112 ENSE00003489217
##   [16]        1 [11107260, 11107280]      -   |       113 ENSE00001833377
##   [17]        1 [11107260, 11107284]      -   |       114 ENSE00001472289
##   [18]        1 [11107260, 11107290]      -   |       115 ENSE00001881401
## 
## ...
## <19 more elements>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
str(metadata(rowRanges(se)))
## List of 1
##  $ genomeInfo:List of 14
##   ..$ Db type                                 : chr "TxDb"
##   ..$ Supporting package                      : chr "GenomicFeatures"
##   ..$ Data source                             : chr "/Library/Frameworks/R.framework/Versions/library/airway/extdata/Homo_sapiens.GRCh37.75_subset.gtf"
##   ..$ Organism                                : chr NA
##   ..$ miRBase build ID                        : chr NA
##   ..$ Genome                                  : chr NA
##   ..$ transcript_nrow                         : chr "65"
##   ..$ exon_nrow                               : chr "279"
##   ..$ cds_nrow                                : chr "158"
##   ..$ Db created by                           : chr "GenomicFeatures package from Bioconductor"
##   ..$ Creation time                           : chr "2015-06-17 17:40:06 +0800 (Wed, 17 Jun 2015)"
##   ..$ GenomicFeatures version at creation time: chr "1.20.1"
##   ..$ RSQLite version at creation time        : chr "1.0.0"
##   ..$ DBSCHEMAVERSION                         : chr "1.1"
(colData(se) <- DataFrame(sampleTable)) # 填充樣本的具體資訊,方便後續分組,尋找差異基因
## DataFrame with 8 rows and 9 columns
##            SampleName     cell      dex    albut        Run avgLength Experiment    Sample
##              <factor> <factor> <factor> <factor>   <factor> <integer>   <factor>  <factor>
## SRR1039508 GSM1275862   N61311    untrt    untrt SRR1039508       126  SRX384345 SRS508568
## SRR1039509 GSM1275863   N61311      trt    untrt SRR1039509       126  SRX384346 SRS508567
## SRR1039512 GSM1275866  N052611    untrt    untrt SRR1039512       126  SRX384349 SRS508571
## SRR1039513 GSM1275867  N052611      trt    untrt SRR1039513        87  SRX384350 SRS508572
## SRR1039516 GSM1275870  N080611    untrt    untrt SRR1039516       120  SRX384353 SRS508575
## SRR1039517 GSM1275871  N080611      trt    untrt SRR1039517       126  SRX384354 SRS508576
## SRR1039520 GSM1275874  N061011    untrt    untrt SRR1039520       101  SRX384357 SRS508579
## SRR1039521 GSM1275875  N061011      trt    untrt SRR1039521        98  SRX384358 SRS508580
##               BioSample
##                <factor>
## SRR1039508 SAMN02422669
## SRR1039509 SAMN02422675
## SRR1039512 SAMN02422678
## SRR1039513 SAMN02422670
## SRR1039516 SAMN02422682
## SRR1039517 SAMN02422673
## SRR1039520 SAMN02422683
## SRR1039521 SAMN02422677

注意:此處得到的資料需要採用EDSeq2包進行差異分析,所以不對資料進行標準化,切記。

差異表達基因分析

我們採用 DESeq2 包進行,差異表達基因的分析

# 此步採用 airway 包自帶的se資料進行後續操作,可以忽略。如果沒有進行上面的步驟也可以直接採用下面的資料進行後續操作。
data("airway")
se <- airway
library("DESeq2")
dds <- DESeqDataSet(se, design = ~ cell + dex) # design 引數為 formula,此處為cell和dex兩個因素,~ cell + dex表示我們想控制cell研究dex的影響。

採用DESeqDataSetFromMatrix函式從matrix中獲取資料

countdata <- assay(se)  # 可以根據自己的需要填充自己的資料(matrix格式),這裡以assay(se)為例
class(countdata)
head(countdata)
coldata <- colData(se)
(ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
                                 colData = coldata,
                                 design = ~ cell + dex))
dds$dex <- relevel(dds$dex, "untrt") # 將 untrt 定義為dex因素的第一水平,隨後的foldchange 將採用 trt/untrt
dds <- DESeq(dds)
(res <- results(dds)) # 得到結果,可以根據padj來挑選合適的差異表達基因,log2FoldChange來確定基因上調還是下調,pvalue的校正採用了Benjamini-Hochberg方法,具體見 ?p.adjust
## log2 fold change (MAP): dex trt vs untrt 
## Wald test p-value: dex trt vs untrt 
## DataFrame with 20 rows and 6 columns
##                   baseMean log2FoldChange      lfcSE       stat       pvalue         padj
##                  <numeric>      <numeric>  <numeric>  <numeric>    <numeric>    <numeric>
## ENSG00000009724   41.75438     -0.9605961 0.25246721  -3.804835 1.418988e-04 3.783967e-04
## ENSG00000116649 1222.28667      0.2071285 0.07112287   2.912263 3.588209e-03 7.176417e-03
## ENSG00000120942  280.89869      0.1403078 0.10305342   1.361505 1.733540e-01 2.311387e-01
## ENSG00000120948 2698.77923     -0.7665547 0.14936634  -5.132044 2.866120e-07 1.146448e-06
## ENSG00000171819  188.08482      4.3627872 0.57110974   7.639140 2.186777e-14 1.749421e-13
## ...                    ...            ...        ...        ...          ...          ...
## ENSG00000238199  0.3913511     -0.3733789  1.6308428 -0.2289484   0.81890903           NA
## ENSG00000253086  0.1518861     -1.0350645  1.5729023 -0.6580603   0.51049939           NA
## ENSG00000264181  0.0000000             NA         NA         NA           NA           NA
## ENSG00000271794  0.0000000             NA         NA         NA           NA           NA
## ENSG00000271895 34.9593217     -0.5451150  0.2920692 -1.8663899   0.06198683   0.09917893
mcols(res, use.names=TRUE)
## DataFrame with 6 rows and 2 columns
##                        type                               description
##                 <character>                               <character>
## baseMean       intermediate mean of normalized counts for all samples
## log2FoldChange      results  log2 fold change (MAP): dex trt vs untrt
## lfcSE               results          standard error: dex trt vs untrt
## stat                results          Wald statistic: dex trt vs untrt
## pvalue              results       Wald test p-value: dex trt vs untrt
## padj                results                      BH adjusted p-values
summary(res)
## 
## out of 16 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)     : 2, 12% 
## LFC < 0 (down)   : 3, 19% 
## outliers [1]     : 0, 0% 
## low counts [2]   : 8, 50% 
## (mean count < 12.1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

儲存資料

write.csv(res, file = '/your/path/')

sessionInfo

sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.3 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    graphics  grDevices utils     datasets  stats     methods   base     
## 
## other attached packages:
##  [1] GenomicAlignments_1.4.1   GenomicFeatures_1.20.1    AnnotationDbi_1.30.1     
##  [4] Biobase_2.28.0            knitr_1.10.5              BiocStyle_1.6.0          
##  [7] Rsamtools_1.20.4          Biostrings_2.36.1         XVector_0.8.0            
## [10] airway_0.102.0            DESeq2_1.8.1              RcppArmadillo_0.5.200.1.0
## [13] Rcpp_0.11.6               GenomicRanges_1.20.5      GenomeInfoDb_1.4.0       
## [16] IRanges_2.2.4             S4Vectors_0.6.0           BiocGenerics_0.14.0      
## [19] readr_0.1.1               sqldf_0.4-10              RSQLite_1.0.0            
## [22] DBI_0.3.1                 gsubfn_0.6-6              proto_0.3-10             
## [25] dplyr_0.4.1               plyr_1.8.3               
## 
## loaded via a namespace (and not attached):
##  [1] splines_3.2.0        Formula_1.2-1        assertthat_0.1       latticeExtra_0.6-26 
##  [5] yaml_2.1.13          lattice_0.20-31      chron_2.3-45         digest_0.6.8        
##  [9] RColorBrewer_1.1-2   colorspace_1.2-6     htmltools_0.2.6      XML_3.98-1.2        
## [13] biomaRt_2.24.0       genefilter_1.50.0    zlibbioc_1.14.0      xtable_1.7-4        
## [17] snow_0.3-13          scales_0.2.5         BiocParallel_1.2.3   annotate_1.46.0     
## [21] ggplot2_1.0.1        nnet_7.3-9           survival_2.38-1      magrittr_1.5        
## [25] evaluate_0.7         MASS_7.3-40          foreign_0.8-63       tools_3.2.0         
## [29] formatR_1.2          stringr_1.0.0.9000   munsell_0.4.2        locfit_1.5-9.1      
## [33] cluster_2.0.1        lambda.r_1.1.7       futile.logger_1.4.1  grid_3.2.0          
## [37] RCurl_1.95-4.6       bitops_1.0-6         tcltk_3.2.0          rmarkdown_0.7       
## [41] gtable_0.1.2         reshape2_1.4.1       gridExtra_0.9.1      rtracklayer_1.28.4  
## [45] Hmisc_3.16-0         futile.options_1.0.0 stringi_0.4-1        geneplotter_1.46.0  
## [49] rpart_4.1-9          acepack_1.3-3.3
library(knitr)
knit('/Users/lipidong/baiduyun/work/RFile/MarkDown/funSet.Rmd', output = '~/learn/blog/_posts/2015-06-17-RNA-Seq.md')