Basic DE downstream of piscem

Author

Rob Patro

Note: This basic tutorial follows the same structure as the excellent RNA-seq workflow: gene-level exploratory analysis and differential expression vignette, but demonstrates the tximport support for piscem, and also strips out a lot of the interesting exploratory analysis to just get to a basic DE result.

Loading the required packages

In order to perform our analysis, we will need a few packages installed. The below code assumes that all of them are already installed, with the exception of tximport, which we need to install from GitHub, as the support for piscem is not yet available upstream in the current BioConductor release.

devtools::install_github("https://github.com/thelovelab/tximport.git")
library(tximport)
library(dplyr)
library(arrow)
library("DESeq2")
library("PoiClaClu")
library("pheatmap")
library("RColorBrewer")

Loading the sample table and transcript-to-gene map

We’ll read in the run table that contains the relevant information about the samples.

dir <- file.path('.')
coldata <- read.delim(file.path(dir, "SraRunTable.txt"), sep=',')
colnames(coldata)

 [1] "Run"                 "Assay.Type"          "AvgSpotLen"         
 [4] "Bases"               "BioProject"          "BioSample"          
 [7] "Bytes"               "cell_line"           "cell_type"          
[10] "Center.Name"         "Consent"             "DATASTORE.filetype" 
[13] "DATASTORE.provider"  "DATASTORE.region"    "ercc_mix"           
[16] "Experiment"          "GEO_Accession..exp." "Instrument"         
[19] "LibraryLayout"       "LibrarySelection"    "LibrarySource"      
[22] "Organism"            "Platform"            "ReleaseDate"        
[25] "create_date"         "version"             "Sample.Name"        
[28] "source_name"         "SRA.Study"           "tissue"             
[31] "treatment"

Then, we strip down the run table to just the most basic information.

samples_of_interest <- c("SRR1039508", "SRR1039509", "SRR1039512", "SRR1039513", "SRR1039516", "SRR1039517", "SRR1039520", "SRR1039521")
idx <- c("Run","cell_line","treatment")
coldata <- coldata[,idx]
colnames(coldata) <- c("run","cell","dex")
coldata <- coldata[coldata$run %in% samples_of_interest,]
coldata

          run    cell           dex
1  SRR1039508  N61311     Untreated
2  SRR1039509  N61311 Dexamethasone
5  SRR1039512 N052611     Untreated
6  SRR1039513 N052611 Dexamethasone
9  SRR1039516 N080611     Untreated
10 SRR1039517 N080611 Dexamethasone
13 SRR1039520 N061011     Untreated
14 SRR1039521 N061011 Dexamethasone

We’ll also reduce the labels to just treated (consisting of Dexamethasone, Albuterol and Albuterol + Dexamethasone) versus untreated.

coldata$dex <- recode_factor(coldata$dex, Untreated = "untrt", Dexamethasone = "trt")
levels(coldata$dex)

[1] "untrt" "trt"

coldata$dex <- relevel(coldata$dex, "untrt")
coldata$dex

[1] untrt trt   untrt trt   untrt trt   untrt trt  
Levels: untrt trt

We are doing a gene-level analysis, so we’ll need a transcript to gene mapping (that file can be obtained here).

tx2gene <- read.csv("t2g.csv")

Loading the `piscem` quantifications

The function we’ll use to load the quantifications.

load_quants <- function(dir, coldata, tx2gene) {
  files <- as.character(lapply(file.path(dir, "quant", coldata$run), function(x) { paste0(x, ".quant") }))
  names(files) <- coldata$run
  txi <- tximport::tximport(files, type="piscem", tx2gene=tx2gene, ignoreAfterBar=TRUE)
  txi
}

Now, we can call this function:

txi <- load_quants(dir, coldata, tx2gene)

reading in files with read_tsv

1 2 3 4 5 6 7 8 
summarizing abundance
summarizing counts
summarizing length
summarizing inferential replicates

Showing the heatmap

dds <- DESeqDataSetFromTximport(txi, coldata, design = ~cell + dex)

Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors

using counts and average transcript lengths from tximport

dds <- dds[ rowSums(counts(dds)) > 1, ]
dds

class: DESeqDataSet 
dim: 35156 8 
metadata(1): version
assays(2): counts avgTxLength
rownames(35156): ENSG00000000003.16 ENSG00000000419.14 ...
  ENSG00000292366.1 ENSG00000292372.1
rowData names(0):
colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
colData names(3): run cell dex

poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
         clustering_distance_rows = poisd$dd,
         clustering_distance_cols = poisd$dd,
         col = colors)

Computing DEGs

The compute_degs function below will do what is necessary to compute the actual differential result table using DESeq2.

compute_degs <- function(dds, coldata) {
  dds <- DESeq(dds)
  res <- results(dds)
  res 
}

We get the result by calling the function:

degs <- compute_degs(dds, coldata)

show the results:

degs <- degs[!is.na(degs$padj),]
degs[degs$padj < 0.05,]

log2 fold change (MLE): dex trt vs untrt 
Wald test p-value: dex trt vs untrt 
DataFrame with 3478 rows and 6 columns
                     baseMean log2FoldChange     lfcSE      stat      pvalue
                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
ENSG00000000003.16    745.167      -0.420638 0.1244011  -3.38130 7.21431e-04
ENSG00000000971.17   5721.921       0.461468 0.0804752   5.73429 9.79191e-09
ENSG00000001167.15    481.664      -0.529965 0.1450754  -3.65303 2.59164e-04
ENSG00000002834.19   8609.485       0.419376 0.0969321   4.32649 1.51502e-05
ENSG00000003096.15    414.807      -0.970928 0.1377851  -7.04669 1.83229e-12
...                       ...            ...       ...       ...         ...
ENSG00000291132.1    334.5893      -0.616538  0.148620  -4.14841 3.34799e-05
ENSG00000291143.1     11.5454       2.607125  0.848655   3.07207 2.12582e-03
ENSG00000291214.1    293.0346      -0.541151  0.177173  -3.05436 2.25539e-03
ENSG00000291237.1  14395.7583       0.509307  0.151184   3.36880 7.54971e-04
ENSG00000292339.1    379.0986       0.522643  0.140373   3.72324 1.96685e-04
                          padj
                     <numeric>
ENSG00000000003.16 5.54837e-03
ENSG00000000971.17 2.19155e-07
ENSG00000001167.15 2.28902e-03
ENSG00000002834.19 1.86395e-04
ENSG00000003096.15 6.36374e-11
...                        ...
ENSG00000291132.1  0.000382518
ENSG00000291143.1  0.013978534
ENSG00000291214.1  0.014716467
ENSG00000291237.1  0.005778280
ENSG00000292339.1  0.001805790

Loading the required packages

Loading the sample table and transcript-to-gene map

Loading the piscem quantifications

Showing the heatmap

Computing DEGs

Loading the `piscem` quantifications