PiGx scRNAseq
This report was generated with PiGx-scRNAseq version 0.0.1.
sceRdsFile = params$sceRdsFile
workdir = params$workdir
outFile = params$outFile
covariates = gsub(' ', '', unlist(strsplit(x = params$covariates, split = ',')))
inputParameterDesc = c('RDS format file containing a SingleCellExperiment object',
'Working directory',
'Path to HTML report output',
'Covariates to use when plotting (PCA and t-SNE)'
)
inputParameterValues = c(sceRdsFile,
workdir,
outFile,
paste0(covariates, collapse = ', '))
inputSettings = data.frame(parameters = inputParameterDesc,
values = inputParameterValues,
stringsAsFactors = FALSE)
DT::datatable(data = inputSettings,
extensions = 'FixedColumns',
options = list(fixedColumns = TRUE,
scrollX = TRUE,
pageLength = length(inputParameterValues),
dom = 't'))
First a pre-generated SingleCellExperiment object is read.
This object must minimally have the following assays
:
cnts
object normalized into counts per million in log2 scale)cpm
)And the following reduced dimension
datasets:
sce = readRDS(file = sceRdsFile)
print(sce)
## class: SingleCellExperiment
## dim: 46603 31471
## metadata(0):
## assays(3): cnts cpm scale
## rownames: NULL
## rowData names(8): Genes ndetected ... row_variability geneName
## colnames: NULL
## colData names(12): sample_name cell_id ... mean_expr
## CellCyclePhase
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
plotCellStats = function(df, x, y, label_x, label_y, title) {
p = ggplot(df, aes_string(x=x, y=y)) +
geom_boxplot() +
labs(x = label_x, y = label_y, title = title) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(angle = 90, hjust = 1))
return(p)
}
plotDesc = list('nGene' = c('Sample',
'Number of detected genes',
'Number of detected genes per cell'),
'max_gene' = c('Sample',
'Maximum gene expression per cell',
'Maximum gene expression'),
'mean_expr' = c('Sample',
'Average gene expression',
'Average gene expression per cell\nfor genes with >0 UMI'))
cellStatsPlots = lapply(names(plotDesc), function(y){
p = plotCellStats(df = as.data.frame(colData(sce)),
x = 'sample_name',
y = y,
label_x = plotDesc[[y]][1],
label_y = plotDesc[[y]][2],
title = plotDesc[[y]][3])
return(p)
})
names(cellStatsPlots) = paste(lapply(plotDesc, function(x) x[2]))
The plot displays the numbers of mapped reads (in millions) for total (green bars) and uniquely-mapped UMI (orange bars) for each sample.
The plot displays the numbers of mapped reads (in millions) categorized by annotation (as indicated by the legend) for each sample.
These are boxplots of cell statistics.
Number of detected genes displays the distributions of numbers of detected genes per cell. Maximum gene expression per cell displays the distributions of maximum gene expression per cell. Average gene expression displays the distributions of average gene expression for genes with at least 1 UMI per cell.
The following plots visualize the cells in the principle component space, for the pairs of components indicated in the axes. These plots can be used to assess the data structure as well as to detect clusters, trends, outliers, and batch effects. The sample points are colored by the feature indicated in the corresponding tab title.
reducedDimPlot = function(df, dim1, dim2, title = NULL, color_by = NULL, gradient = FALSE) {
p = ggplot2::ggplot(df, aes_string(x = dim1, y = dim2)) +
geom_point(aes_string(color = color_by), size = 0.5, alpha = 0.5) +
labs(title = title)
if (gradient == TRUE) {
p = p + scale_color_gradient2(midpoint = 3.3)
}
return(p)
}
colData(sce)$log10nGene = log10(colData(sce)$nGene)
gradientList = c('log10nGene')
covariateList = c(covariates, gradientList)
if('CellCyclePhase' %in% colnames(colData(sce))) {
covariateList = c(covariateList, 'CellCyclePhase')
}
pcaData = as.data.table(cbind(colData(sce), sce@reducedDims$PCA))
pcaPlotList <- lapply(covariateList, function(cov) {
pL<- lapply(paste0('PC', c(2:5)), function(pc) {
myTitle <- paste('PC1 vs', pc)
reducedDimPlot(df = pcaData,
dim1 = 'PC1',
dim2 = pc,
title = myTitle,
color_by = cov,
gradient = cov %in% gradientList)
})
names(pL) <- paste0('PC1 vs PC',c(2:5))
return(pL)
}
)
names(pcaPlotList) = covariateList
tsneData = as.data.table(cbind(colData(sce), reducedDim(sce, 'TSNE')))
tsnePlotList = lapply(covariateList, function(cov) {
reducedDimPlot(df = tsneData, dim1 = 'V1', dim2 = 'V2', title = 'tSNE_1 vs tSNE_2', color_by = cov)
})
names(tsnePlotList) = covariateList
The following plots show the scores obtained through t-SNE dimensionality reduction. These plots can be used to assess the data structure as well as to detect clusters, trends, outliers, and batch effects. The sample points are colored by the feature indicated in the corresponding title.
plotPCAContribution = function(sce, pc, n = 20){
p = attr(reducedDim(sce, 'PCA'), 'gene.loadings') %>%
as.data.frame() %>%
dplyr::mutate(gene_id = rownames(.)) %>%
dplyr::mutate_(X = pc) %>%
dplyr::arrange(-abs(X)) %>%
head(n=n) %>%
dplyr::arrange(X) %>%
dplyr::mutate(gene_name = rowData(sce)[match(gene_id, rowData(sce)$Genes),]$geneName) %>%
dplyr::mutate(gene_name = factor(gene_name, levels=gene_name)) %>%
ggplot2::ggplot(aes_string(x = pc, y = 'gene_name')) +
geom_point() +
ylab('Gene Name')+
ggtitle(paste('Top',n,'genes for', pc)) + theme_bw()
return(p)
}
pcaContributionPlots <- lapply(paste0('PC', 1:5), function(pc) {
plotPCAContribution(sce = sce, pc = pc, n = 20)
})
names(pcaContributionPlots) <- paste0('PC', 1:5)
The following plots display the scores of top genes contributing to the given principle components. x-axis indicates the contribution for the given principle component while y-axis indicates the genes contributing significantly to the component.
for (i in 1:length(pcaContributionPlots)) {
cat("#### ",names(pcaContributionPlots)[[i]],"\n")
print(pcaContributionPlots[[i]])
cat('\n\n')
}
plotPCAcontributorsHeatmap = function(workdir, sce, cellCount = 500, pc, n = 20){
M <- attr(reducedDim(sce, 'PCA'), 'gene.loadings')
if(n > nrow(M)) {
n = nrow(M)
}
topGenes = names(sort(x = M[,pc], decreasing = TRUE))[1:n]
if(ncol(sce) < cellCount) {
cellCount = ncol(sce)
}
select <- sample(1:ncol(sce), cellCount)
exprMatrix <- assay(sce, 'scale')[match(topGenes, rowData(sce)$Genes),select]
annotationDf <- as.data.frame(colData(sce)[select,covariateList])
rownames(annotationDf) <- colData(sce)[select,]$cell_id
rownames(exprMatrix) <- rowData(sce)[match(topGenes, rowData(sce)$Genes),]$geneName
colnames(exprMatrix) <- rownames(annotationDf)
p <- pheatmap::pheatmap(mat = exprMatrix,
show_rownames = TRUE,
annotation_col = annotationDf,
show_colnames = FALSE)
filename = file.path(workdir, paste0("heatmap", pc, '.png'))
png(filename = filename, width = 800, height = 500, units = 'px')
grid::grid.newpage()
grid::grid.draw(p$gtable)
dev.off()
return(filename)
}
pcaContributorHeatmaps <- lapply(paste0('PC', 1:5), function(pc) {
plotPCAcontributorsHeatmap(workdir = workdir, sce = sce, pc = pc, n = 20, cellCount = 100)
})
names(pcaContributorHeatmaps) <- paste0('PC', 1:5)
The following plots display heatmaps of the expression values of top contributing genes to the given principle component. Columns are cells and rows are the top genes. CellCyclePhase indicates cell cycle phases classified based on gene expression data. log10nGene indicates log10-transformed numbers of genes detected per cell.
for (i in 1:length(pcaContributorHeatmaps)) {
cat("#### ",names(pcaContributorHeatmaps)[[i]],"\n")
cat(paste0("![](",pcaContributorHeatmaps[[i]],")"))
cat('\n\n')
}
plotGeneExpressionOnPCA <- function(sce, geneNames, assay.type = 'scale', dim1, dim2, reducedDimType = 'PCA') {
geneNames <- intersect(geneNames, rowData(sce)$geneName)
if(length(geneNames) > 0){
#find indices of given genes in the rowData
select <- match(geneNames, rowData(sce)$geneName)
#subset assay matrix for the selected genes
assayData <- assay(sce, assay.type)[select,]
rownames(assayData) <- geneNames
dt <- cbind(data.table::as.data.table(reducedDim(sce, reducedDimType)[,c(dim1, dim2)]),
data.table::as.data.table(t(assayData)))
mdt = reshape2::melt(dt, id.vars = 1:2)
p = ggplot2::ggplot(mdt, aes_string(x = colnames(mdt)[1],
y = colnames(mdt)[2])) +
geom_point(aes(color = value), size = 0.2, alpha = 0.5) +
facet_wrap(~ variable, ncol = 4) +
theme(axis.text.x = element_text(angle = 90),
strip.text = element_text(size = 10)) +
scale_color_gradient(low = 'grey', high = 'blue')
return(p)
} else{
stop("None of the provided gene ids match the gene ids
in the SingleCellExperiment object")
}
}
highlyVariableGenes <- rowData(sce)[order(rowData(sce)$fitted_variability, decreasing = T),][1:20,]$geneName
myPlotList = list('PC1_vs_PC2' = 1:2,
'PC3_vs_PC4' = 3:4,
'PC5_vs_PC6' = 5:6,
'PC7_vs_PC8' = 7:8)
geneProjectionPlots = lapply(myPlotList, function(x) {
plotGeneExpressionOnPCA(sce = sce,
geneNames = highlyVariableGenes,
assay.type = 'scale',
reducedDimType = 'PCA',
dim1 = x[1], dim2 = x[2])
})
names(geneProjectionPlots) = names(myPlotList)
The following plots display the expression of the top variable genes in the dataset (accross all cells). Each dot is a cell, displayed in a space defined by the principle component analysis. The axes show the position of each cell along the two given principle components. The color of each cell indicates the cell-type-specific gene expression value (shown in the value legend).
for (i in 1:length(geneProjectionPlots)) {
cat("#### ",names(geneProjectionPlots)[[i]],"\n")
print(geneProjectionPlots[[i]])
cat('\n\n')
}
myPlotList = list('tSNE-1 vs tSNE-2' = 1:2)
geneProjectionPlots = lapply(myPlotList, function(x) {
plotGeneExpressionOnPCA(sce = sce,
geneNames = highlyVariableGenes,
assay.type = 'scale',
reducedDimType = 'TSNE',
dim1 = x[1], dim2 = x[2])
})
names(geneProjectionPlots) = names(myPlotList)
The following plots display the expression of the top variable genes in the dataset (accross all cells). Each dot is a cell, displayed in a space defined by the t-SNE analysis. The axes show the position of each cell along the two given t-SNE components. The color of each cell indicates the cell-type-specific gene expression value (shown in the value legend).
for (i in 1:length(geneProjectionPlots)) {
cat("#### ",names(geneProjectionPlots)[[i]],"\n")
print(geneProjectionPlots[[i]])
cat('\n\n')
}
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /gnu/store/ccad09zgj85251ksp5xd71ds3cz3f7gp-openblas-0.2.20/lib/libopenblasp-r0.2.20.so
##
## locale:
## [1] en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] HDF5Array_1.8.0 rhdf5_2.24.0
## [3] bindrcpp_0.2.2 pheatmap_1.0.10
## [5] dplyr_0.7.5 data.table_1.11.4
## [7] SingleCellExperiment_1.2.0 SummarizedExperiment_1.10.1
## [9] DelayedArray_0.6.0 BiocParallel_1.14.1
## [11] matrixStats_0.53.1 Biobase_2.40.0
## [13] GenomicRanges_1.32.3 GenomeInfoDb_1.16.0
## [15] IRanges_2.14.10 S4Vectors_0.18.2
## [17] BiocGenerics_0.26.0 DT_0.4
## [19] cowplot_0.9.2 ggplot2_2.2.1
## [21] rmarkdown_1.9
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.4 reshape2_1.4.3
## [3] purrr_0.2.5 lattice_0.20-35
## [5] colorspace_1.3-2 htmltools_0.3.6
## [7] yaml_2.1.19 rlang_0.2.1
## [9] pillar_1.2.3 later_0.7.2
## [11] glue_1.2.0 RColorBrewer_1.1-2
## [13] GenomeInfoDbData_0.99.1 plyr_1.8.4
## [15] bindr_0.1.1 stringr_1.3.1
## [17] munsell_0.4.3 gtable_0.2.0
## [19] htmlwidgets_1.2 evaluate_0.10.1
## [21] labeling_0.3 knitr_1.20
## [23] httpuv_1.4.3 crosstalk_1.0.0
## [25] Rcpp_0.12.17 xtable_1.8-2
## [27] scales_0.5.0 backports_1.1.2
## [29] promises_1.0.1 jsonlite_1.5
## [31] XVector_0.20.0 mime_0.5
## [33] digest_0.6.15 stringi_1.2.2
## [35] shiny_1.1.0 grid_3.5.0
## [37] rprojroot_1.3-2 tools_3.5.0
## [39] magrittr_1.5 lazyeval_0.2.1
## [41] RCurl_1.95-0.1.2 tibble_1.4.2
## [43] pkgconfig_2.0.1 Matrix_1.2-14
## [45] assertthat_0.2.0 Rhdf5lib_1.2.1
## [47] R6_2.2.2 compiler_3.5.0