Skip to contents

run_CIBRA calculates an impact score from provided expression data and defined definitions

Usage

run_CIBRA(
  data,
  definition_matrix,
  columns,
  control_definition,
  confidence = 0.1,
  iterations = 0,
  covariates = c(),
  method = "DESeq2",
  parallel = FALSE,
  speedup = FALSE,
  min_n = 10,
  outputDir = FALSE,
  sample_column = NULL
)

Arguments

data

RNA count dataframe with genes as rows and samples as columns

definition_matrix

design dataframe of the comparisons to make, binary columns (factors) with an indication of the groups to test and samples as rownames.

columns

columns from the definition_matrix to assess with respect to the control definition

control_definition

Definition term that will be used as reference for the comparison (e.g. WT)

confidence

alpha threshold to calculate the proportion (default is 0.1)

iterations

number of iterations for random permutation (recommendation is 50 iterations per definition)

covariates

list of column names from the definition matrix to use as covariates (supported only with DESeq2)

method

method to use for differential expression analysis. options: 'DESeq2', 'edgeR' and 'limma'. Defaults to 'DESeq2'

parallel

boolean state to operate in parallel mode. Defaults to FALSE

speedup

boolean state to run DESeq2 in speedup mode. Defaults to FALSE

min_n

minimal number of samples to consider for analysis. Defaults to the recommended minimum number of samples: 10

outputDir

path for result files, if FALSE, no results will be written. Defaults to FALSE

sample_column

name of the column containing the sample names, if NULL, rownames will be used

Value

returns a list containing a dataframe with the signal measures and associated metadata and associated results generated by differential expression analysis

Examples


require(dplyr)
#> Loading required package: dplyr
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union

# load transcription data and genomic alterations
count_data <- CIBRA::TCGA_CRC_rna_data
mutation_profile <- CIBRA::TCGA_CRC_non_silent_mutation_profile

# select BRAF as example
goi <- c("BRAF", "APC", "TP53")

# filter mutation profile for only selected genes
definition_matrix <- mutation_profile[goi]

# select only 30 samples for the cases (BRAF mutant) and controls (BRAF WT) for the sake of speed of the example
# you can remove this limit if you want to run the full comparison

definition_matrix <- definition_matrix %>% tibble::rownames_to_column('sample') %>% group_by(BRAF) %>% sample_n(30) %>% tibble::column_to_rownames('sample')

#' # fix sample names for intersect between the two datatypes
rownames(definition_matrix) <- stringr::str_replace_all(rownames(definition_matrix), "-", ".")

# select intersect between definition matrix and count data
sample_list <- intersect(rownames(definition_matrix), colnames(count_data))

# select data and definition matrix as the intersect
count_data <- count_data[sample_list]
definition_matrix <- definition_matrix[sample_list,]

# select BRAF
columns <- c("BRAF") # if more column were provided in the definition_matrix

# set up parameters for CIBRA
control_definition <- "WT" # reference condition

confidence <- 0.1 # confidence threshold for the proportion calculation

# set permutation to 0 to skip permutations

iterations = 0 # number of permutation iterations to be performed per condition

# register(SnowParam(2)) # set number of cores to be used for parallel processing

# run CIBRA with default DE analysis method (DESeq2)

CIBRA_res <- run_CIBRA(count_data, as.data.frame(definition_matrix), columns = columns, control_definition = control_definition, confidence = confidence, iterations = iterations, parallel = FALSE) # set parallel to TRUE for parallel processing
#> [1] "SNV"
#> [1] "BRAF" "SNV" 
#> estimating size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing
#> -- replacing outliers and refitting for 2735 genes
#> -- DESeq argument 'minReplicatesForReplace' = 7 
#> -- original counts are preserved in counts(dds)
#> estimating dispersions
#> fitting model and testing

# calculate the pvalue by comparing to a reference distribution generated with DESeq2
perm_data <- CIBRA::perm_dist_crc_tcga

# use the gamma test to also calculate a fitted p-value
CIBRA_res_stat <-  permutation_test(CIBRA_res$CIBRA_results, perm_data, test = "gamma")