Computationally identify biologically relevant alterations

run_CIBRA calculates an impact score from provided expression data and defined definitions

Usage

run_CIBRA(
  data,
  definition_matrix,
  columns,
  control_definition,
  confidence = 0.1,
  iterations = 0,
  covariates = c(),
  method = "DESeq2",
  parallel = FALSE,
  speedup = FALSE,
  min_n = 10,
  outputDir = FALSE,
  sample_column = NULL
)

Arguments

data: RNA count dataframe with genes as rows and samples as columns
definition_matrix: design dataframe of the comparisons to make, binary columns (factors) with an indication of the groups to test and samples as rownames.
columns: columns from the definition_matrix to assess with respect to the control definition
control_definition: Definition term that will be used as reference for the comparison (e.g. WT)
confidence: alpha threshold to calculate the proportion (default is 0.1)
iterations: number of iterations for random permutation (recommendation is 50 iterations per definition)
covariates: list of column names from the definition matrix to use as covariates (supported only with DESeq2)
method: method to use for differential expression analysis. options: 'DESeq2', 'edgeR' and 'limma'. Defaults to 'DESeq2'
parallel: boolean state to operate in parallel mode. Defaults to FALSE
speedup: boolean state to run DESeq2 in speedup mode. Defaults to FALSE
min_n: minimal number of samples to consider for analysis. Defaults to the recommended minimum number of samples: 10
outputDir: path for result files, if FALSE, no results will be written. Defaults to FALSE
sample_column: name of the column containing the sample names, if NULL, rownames will be used

Value

returns a list containing a dataframe with the signal measures and associated metadata and associated results generated by differential expression analysis

Examples


require(dplyr)
#> Loading required package: dplyr
#> 
#> Attaching package: ‘dplyr’
#> The following objects are masked from ‘package:stats’:
#> 
#>     filter, lag
#> The following objects are masked from ‘package:base’:
#> 
#>     intersect, setdiff, setequal, union

# load transcription data and genomic alterations
count_data <- CIBRA::TCGA_CRC_rna_data
mutation_profile <- CIBRA::TCGA_CRC_non_silent_mutation_profile

# select BRAF as example
goi <- c("BRAF", "APC", "TP53")

# filter mutation profile for only selected genes
definition_matrix <- mutation_profile[goi]

# select only 30 samples for the cases (BRAF mutant) and controls (BRAF WT) for the sake of speed of the example
# you can remove this limit if you want to run the full comparison

definition_matrix <- definition_matrix %>% tibble::rownames_to_column('sample') %>% group_by(BRAF) %>% sample_n(30) %>% tibble::column_to_rownames('sample')

#' # fix sample names for intersect between the two datatypes
rownames(definition_matrix) <- stringr::str_replace_all(rownames(definition_matrix), "-", ".")

# select intersect between definition matrix and count data
sample_list <- intersect(rownames(definition_matrix), colnames(count_data))

# select data and definition matrix as the intersect
count_data <- count_data[sample_list]
definition_matrix <- definition_matrix[sample_list,]

# select BRAF
columns <- c("BRAF") # if more column were provided in the definition_matrix

# set up parameters for CIBRA
control_definition <- "WT" # reference condition

confidence <- 0.1 # confidence threshold for the proportion calculation

# set permutation to 0 to skip permutations

iterations = 0 # number of permutation iterations to be performed per condition

# register(SnowParam(2)) # set number of cores to be used for parallel processing

# run CIBRA with default DE analysis method (DESeq2)

CIBRA_res <- run_CIBRA(count_data, as.data.frame(definition_matrix), columns = columns, control_definition = control_definition, confidence = confidence, iterations = iterations, parallel = FALSE) # set parallel to TRUE for parallel processing
#> [1] "SNV"
#> [1] "BRAF" "SNV" 
#> estimating size factors
#> estimating dispersions
#> gene-wise dispersion estimates
#> mean-dispersion relationship
#> final dispersion estimates
#> fitting model and testing
#> -- replacing outliers and refitting for 2735 genes
#> -- DESeq argument 'minReplicatesForReplace' = 7 
#> -- original counts are preserved in counts(dds)
#> estimating dispersions
#> fitting model and testing

# calculate the pvalue by comparing to a reference distribution generated with DESeq2
perm_data <- CIBRA::perm_dist_crc_tcga

# use the gamma test to also calculate a fitted p-value
CIBRA_res_stat <-  permutation_test(CIBRA_res$CIBRA_results, perm_data, test = "gamma")