This vignette presents an example workflow for CRSO. If a parallel backend is registered, CRSO will make use of all available registered workers. If not, it can still be run sequentially.
library(crso)
#> Loading required package: foreach
### Load example dataset consisting of TCGA melanoma (SKCM) patients.
data(skcm)
list2env(skcm.list,envir=globalenv())
#> <environment: R_GlobalEnv>
names(skcm.list) ### load D, P and cnv.dictionary
#> [1] "D" "P" "cnv.dictionary"
Q <- log10(P) ### Q is the penalty matrix derived from P
The parameter values used are not the default values that are recommended in the CRSO paper, because the computation time may be excessive depending on the parallel availability. Instead, these were chosen to demonstrate how to use all of the functions in the package. The recommended parameter values are indicated in parentheses.
rule.thresh <- .05 # Minimum percentage of samples covered by each rule in the rule library. (Default = .03)
spr <- 4 # Phase 1 random sets per rule (default = 40, recommend at least 40)
trn <- 40 # Phase 1 stop criteria (default = 16, recommend at most 24)
k.max <- 6 # Maximum RS size (default is 12)
max.stored <- 1000 # Max RS stored per K in phases 2 and 3 (no default, recommend at least 10^4)
max.nrs.ee <- 1000 # Phase 2 max rs per K (default is 10^5, recommend at least 10^5)
max.compute <- 10*max.nrs.ee # Max raw im size for phase 2 (default is 10^6, recommend at least 10*max.nrs.ee)
max.nrs.borrow <- 1000 # Phase 3 max rs per K (default is 10^5, recommend at least 10^5)
filter.thresh <- .03 # minimum assignment threshold per rule set (default is .03)
### Generalized core parameters:
num.gc.iter <- 10 # Number GC iterations (default is 100)
num.gc.eval <- 100 # Rulesets evaluated per K per GC iter (default is 1000)
set.seed(100)
rm.full <- buildRuleLibrary(D,rule.thresh) # build rule library
rm.ordered <- makePhaseOneOrderedRM(D,rm.full,spr,Q,trn,shouldPrint = TRUE) # run phase 1
#> [1] "Starting rules = 60"
#> Warning: executing %dopar% sequentially: no parallel backend registered
#> [1] "Remaining rules = 54"
#> [1] "Remaining rules = 49"
#> [1] "Remaining rules = 45"
#> [1] "Remaining rules = 41"
#> [1] "Remaining rules = 40"
#> Time difference of 0.3392831 mins
pool.sizes <- getPoolSizes(rm.ordered,k.max,max.nrs.ee,max.compute)
### The pool size for each K is the number of rules considered for exhaustive evaluation in phase 2.
til.p2 <- makePhaseTwoImList(D,Q,rm.ordered,k.max,pool.sizes,max.stored,shouldPrint = TRUE) # Run phase 2
#> [1] "Starting Phase 2: Exhaustive Evaluation"
#> [1] "Evaluation time for k = 2: 0.01989 min"
#> [1] "Evaluation time for k = 3: 0.02246 min"
#> [1] "Evaluation time for k = 4: 0.01906 min"
#> [1] "Evaluation time for k = 5: 0.02347 min"
#> [1] "Evaluation time for k = 6: 0.03056 min"
#> [1] "Total Phase 2 Time: 0.1189 min"
### TIL stands for top index list. The output of phase two is a list of top index matrices for each k. Each index matrix contains the rule sets ordered by performance. For example the best performing rule set of size 3 will be the first row of the K.3 index matrix. For K=1 the index matrix is actually a vector.
til.p3 <- makePhaseThreeImList(D,Q,rm.ordered,til.p2,pool.sizes,max.stored,max.nrs.borrow,shouldPrint = TRUE)
#> [1] "Starting Phase 3: neighbor expansion"
#> [1] "Updated top im for K = 3, time = 0.07684 min"
#> [1] "Updated top im for K = 4, time = 0.1109 min"
#> [1] "Updated top im for K = 5, time = 0.1249 min"
#> [1] "Updated top im for K = 6, time = 0.1379 min"
#> [1] "Total Phase 3 Time: 0.4505 min"
### Make TIL for phase 3 by updating phase two til to consider neighbor rule sets.
til.filtered <- makeFilteredImList(D,Q,rm.ordered,til.p3,filter.thresh)
### Filter the phase 3 results to only include rule sets for which every rule is assigned to a minimum percentage of samples, default is 3%
tpl.filtered <- evaluateListOfIMs(D,Q,rm.ordered,til.filtered)
### Get top performance list (TPL), which contains the objective function score of all of the rule sets in til.filtered
best.rs.list <- getBestRsList(rm.ordered,tpl.filtered,til.filtered)
### This is a list of the best rule sets for all K
core.K <- getCoreK(D,rm.ordered,tpl.filtered,til.filtered) # Determine core K
core.ruleset <- getCoreRS(D,rm.ordered,tpl.filtered,til.filtered) # Extract core rule set
print(core.ruleset)
#> r2 r1 r3
#> "CDKN2A-MD.NRAS-M" "BRAF-M.CDKN2A-MD" "BRAF-M.PTEN-MD"
#> r29 r13 r32
#> "NRAS-M.TP53-M" "BRAF-M.HIPK2/TBXAS1-A" "BRAF-M.RN7SKP254-A"
list.subset.cores <- makeSubCoreList(D,Q,rm.ordered,til.filtered,num.gc.iter,num.gc.eval)
#> [1] "Subset Core Iteration = 1"
#> [1] "Subset Core Iteration = 2"
#> [1] "Subset Core Iteration = 3"
#> [1] "Subset Core Iteration = 4"
#> [1] "Subset Core Iteration = 5"
#> [1] "Subset Core Iteration = 6"
#> [1] "Subset Core Iteration = 7"
#> [1] "Subset Core Iteration = 8"
#> [1] "Subset Core Iteration = 9"
#> [1] "Subset Core Iteration = 10"
#> Time difference of 8.281652 secs
### list.subset.cores is a list of core rule set derived from subsampling iterations
gcr.df <- getGCRs(list.subset.cores) # Generalized core rules
print(gcr.df)
#> GCR Confidence Confidence.Level
#> 10 CDKN2A-MD.NRAS-M 100 High
#> 9 BRAF-M.PTEN-MD 100 High
#> 8 BRAF-M.CDKN2A-MD 100 High
#> 7 NRAS-M.TP53-M 80 High
#> 6 BRAF-M.HIPK2/TBXAS1-A 60 Medium
#> 5 BRAF-M.RN7SKP254-A 30 Low
#> 4 ARID2-M.NRAS-M 30 Low
#> 3 BRAF-M.TP53-M 20 Low
#> 2 NRAS-M.SMYD3-A 10 Low
#> 1 HULC-A.NRAS-M 10 Low
gcd.df <- getGCDs(list.subset.cores) # Generalized core duos
print(gcd.df)
#> GCD Confidence Confidence.Level
#> 10 CDKN2A-MD.NRAS-M 100 High
#> 9 BRAF-M.PTEN-MD 100 High
#> 8 BRAF-M.CDKN2A-MD 100 High
#> 7 NRAS-M.TP53-M 80 High
#> 6 BRAF-M.HIPK2/TBXAS1-A 60 Medium
#> 5 BRAF-M.RN7SKP254-A 30 Low
#> 4 ARID2-M.NRAS-M 30 Low
#> 3 BRAF-M.TP53-M 20 Low
#> 2 NRAS-M.SMYD3-A 10 Low
#> 1 HULC-A.NRAS-M 10 Low
gce.df <- getGCEs(list.subset.cores) # Generalized core events
print(gce.df)
#> GCE Confidence Confidence.Level
#> 10 PTEN-MD 100 High
#> 9 NRAS-M 100 High
#> 8 CDKN2A-MD 100 High
#> 7 BRAF-M 100 High
#> 6 TP53-M 90 High
#> 5 HIPK2/TBXAS1-A 60 Medium
#> 4 RN7SKP254-A 30 Low
#> 3 ARID2-M 30 Low
#> 2 SMYD3-A 10 Low
#> 1 HULC-A 10 Low