This vignette presents an example workflow for CRSO. If a parallel backend is registered, CRSO will make use of all available registered workers. If not, it can still be run sequentially.

library(crso)
#> Loading required package: foreach
### Load example dataset consisting of TCGA melanoma (SKCM) patients.
data(skcm)
list2env(skcm.list,envir=globalenv())
#> <environment: R_GlobalEnv>
names(skcm.list) ### load D, P and cnv.dictionary
#> [1] "D"              "P"              "cnv.dictionary"
Q <- log10(P) ### Q is the penalty matrix derived from P

Specify parameters

The parameter values used are not the default values that are recommended in the CRSO paper, because the computation time may be excessive depending on the parallel availability. Instead, these were chosen to demonstrate how to use all of the functions in the package. The recommended parameter values are indicated in parentheses.

rule.thresh <- .05 # Minimum percentage of samples covered by each rule in the rule library. (Default = .03)
spr <- 4 # Phase 1 random sets per rule (default = 40, recommend at least 40) 
trn <- 40 # Phase 1 stop criteria (default = 16, recommend at most 24)
k.max <- 6 # Maximum RS size (default is 12)
max.stored <- 1000 # Max RS stored per K in phases 2 and 3 (no default, recommend at least 10^4)
max.nrs.ee <- 1000 # Phase 2 max rs per K (default is 10^5, recommend at least 10^5)
max.compute <- 10*max.nrs.ee # Max raw im size for phase 2 (default is 10^6, recommend at least 10*max.nrs.ee)
max.nrs.borrow <- 1000 # Phase 3 max rs per K (default is 10^5, recommend at least 10^5)
filter.thresh <- .03 # minimum assignment threshold per rule set (default is .03)
### Generalized core parameters:
num.gc.iter <- 10 # Number GC iterations (default is 100)
num.gc.eval <- 100 # Rulesets evaluated per K per GC iter (default is 1000)

Phase 1

set.seed(100)
rm.full <- buildRuleLibrary(D,rule.thresh) # build rule library
rm.ordered <- makePhaseOneOrderedRM(D,rm.full,spr,Q,trn,shouldPrint = TRUE) # run phase 1
#> [1] "Starting rules = 60"
#> Warning: executing %dopar% sequentially: no parallel backend registered
#> [1] "Remaining rules = 54"
#> [1] "Remaining rules = 49"
#> [1] "Remaining rules = 45"
#> [1] "Remaining rules = 41"
#> [1] "Remaining rules = 40"
#> Time difference of 0.3392831 mins

Phase 2

pool.sizes <- getPoolSizes(rm.ordered,k.max,max.nrs.ee,max.compute) 
### The pool size for each K is the number of rules considered for exhaustive evaluation in phase 2.
til.p2 <- makePhaseTwoImList(D,Q,rm.ordered,k.max,pool.sizes,max.stored,shouldPrint = TRUE) # Run phase 2
#> [1] "Starting Phase 2: Exhaustive Evaluation"
#> [1] "Evaluation time for k = 2: 0.01989 min"
#> [1] "Evaluation time for k = 3: 0.02246 min"
#> [1] "Evaluation time for k = 4: 0.01906 min"
#> [1] "Evaluation time for k = 5: 0.02347 min"
#> [1] "Evaluation time for k = 6: 0.03056 min"
#> [1] "Total Phase 2 Time: 0.1189 min"
### TIL stands for top index list. The output of phase two is a list of top index matrices for each k.  Each index matrix contains the rule sets ordered by performance. For example the best performing rule set of size 3 will be the first row of the K.3 index matrix. For K=1 the index matrix is actually a vector.

Phase 3 and Filtering

til.p3 <- makePhaseThreeImList(D,Q,rm.ordered,til.p2,pool.sizes,max.stored,max.nrs.borrow,shouldPrint = TRUE)
#> [1] "Starting Phase 3: neighbor expansion"
#> [1] "Updated top im for K = 3, time = 0.07684 min"
#> [1] "Updated top im for K = 4, time = 0.1109 min"
#> [1] "Updated top im for K = 5, time = 0.1249 min"
#> [1] "Updated top im for K = 6, time = 0.1379 min"
#> [1] "Total Phase 3 Time: 0.4505 min"
### Make TIL for phase 3 by updating phase two til to consider neighbor rule sets.

til.filtered <- makeFilteredImList(D,Q,rm.ordered,til.p3,filter.thresh)
### Filter the phase 3 results to only include rule sets for which every rule is assigned to a minimum percentage of samples, default is 3%
tpl.filtered <- evaluateListOfIMs(D,Q,rm.ordered,til.filtered) 
### Get top performance list (TPL), which contains the objective function score of all of the rule sets in til.filtered

Extract best rule sets and core rule set

best.rs.list <- getBestRsList(rm.ordered,tpl.filtered,til.filtered)
### This is a list of the best rule sets for all K

core.K <- getCoreK(D,rm.ordered,tpl.filtered,til.filtered) # Determine core K
core.ruleset <- getCoreRS(D,rm.ordered,tpl.filtered,til.filtered) # Extract core rule set
print(core.ruleset)
#>                      r2                      r1                      r3 
#>      "CDKN2A-MD.NRAS-M"      "BRAF-M.CDKN2A-MD"        "BRAF-M.PTEN-MD" 
#>                     r29                     r13                     r32 
#>         "NRAS-M.TP53-M" "BRAF-M.HIPK2/TBXAS1-A"    "BRAF-M.RN7SKP254-A"

Make generalized core results

list.subset.cores <- makeSubCoreList(D,Q,rm.ordered,til.filtered,num.gc.iter,num.gc.eval)
#> [1] "Subset Core Iteration = 1"
#> [1] "Subset Core Iteration = 2"
#> [1] "Subset Core Iteration = 3"
#> [1] "Subset Core Iteration = 4"
#> [1] "Subset Core Iteration = 5"
#> [1] "Subset Core Iteration = 6"
#> [1] "Subset Core Iteration = 7"
#> [1] "Subset Core Iteration = 8"
#> [1] "Subset Core Iteration = 9"
#> [1] "Subset Core Iteration = 10"
#> Time difference of 8.281652 secs
### list.subset.cores is a list of core rule set derived from subsampling iterations

gcr.df <- getGCRs(list.subset.cores) # Generalized core rules
print(gcr.df)
#>                      GCR Confidence Confidence.Level
#> 10      CDKN2A-MD.NRAS-M        100             High
#> 9         BRAF-M.PTEN-MD        100             High
#> 8       BRAF-M.CDKN2A-MD        100             High
#> 7          NRAS-M.TP53-M         80             High
#> 6  BRAF-M.HIPK2/TBXAS1-A         60           Medium
#> 5     BRAF-M.RN7SKP254-A         30              Low
#> 4         ARID2-M.NRAS-M         30              Low
#> 3          BRAF-M.TP53-M         20              Low
#> 2         NRAS-M.SMYD3-A         10              Low
#> 1          HULC-A.NRAS-M         10              Low
gcd.df <- getGCDs(list.subset.cores) # Generalized core duos
print(gcd.df)
#>                      GCD Confidence Confidence.Level
#> 10      CDKN2A-MD.NRAS-M        100             High
#> 9         BRAF-M.PTEN-MD        100             High
#> 8       BRAF-M.CDKN2A-MD        100             High
#> 7          NRAS-M.TP53-M         80             High
#> 6  BRAF-M.HIPK2/TBXAS1-A         60           Medium
#> 5     BRAF-M.RN7SKP254-A         30              Low
#> 4         ARID2-M.NRAS-M         30              Low
#> 3          BRAF-M.TP53-M         20              Low
#> 2         NRAS-M.SMYD3-A         10              Low
#> 1          HULC-A.NRAS-M         10              Low
gce.df <- getGCEs(list.subset.cores) # Generalized core events
print(gce.df)
#>               GCE Confidence Confidence.Level
#> 10        PTEN-MD        100             High
#> 9          NRAS-M        100             High
#> 8       CDKN2A-MD        100             High
#> 7          BRAF-M        100             High
#> 6          TP53-M         90             High
#> 5  HIPK2/TBXAS1-A         60           Medium
#> 4     RN7SKP254-A         30              Low
#> 3         ARID2-M         30              Low
#> 2         SMYD3-A         10              Low
#> 1          HULC-A         10              Low