---
title: "GSCUSUM charts"
author: "Lena Hubig"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{GSCUSUM charts}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.dim = c(7,5)
)

library(cusum)
library(ggplot2)

```


## Overview

This vignette describes how to use GSCUSUM charts, an extension to standard CUSUM charts for binary performance data grouped in samples of unequal size. 

## Data preparation
Following information have to be available:

* patient-individual outcomes
* block-identifier in continuous sequence (can be obtain for example with `dplyr::group_indices()`)
* (patient-individual risk scores / risk of adverse event/failure)

These information are collected in a numeric matrix.

Non-risk-adjusted example data:
```{r}
head(gscusum_example_data)



```

Risk-adjusted example data:
```{r}
head(ragscusum_example_data)

```

## Non-risk-adjusted GSCUSUM chart

Like in the standard CUSUM chart (see vignette for CUSUM charts), parameters have to be estimated in order to set up the charts,  

```{r}
failure_probability <- mean(gscusum_example_data$y[gscusum_example_data$year == 2016])

n_patients <- nrow(gscusum_example_data[gscusum_example_data$year == 2016,])
```

and control limits have to be estimated:

```{r}

cusum_limit <- cusum_limit_sim(failure_probability,
                            n_patients,
                            odds_multiplier = 2,
                            n_simulation = 1000,
                            alpha = 0.05,
                            seed = 2046)


print(cusum_limit)
```

GSCUSUM charts are constructed on performance data from 2017.
```{r}

gscusum_data <- gscusum_example_data[gscusum_example_data$year == 2017,]

input_outcomes <- matrix(c(gscusum_data$y, gscusum_data$block_identifier), ncol = 2)


gcs <- gscusum(input_outcomes = input_outcomes,
              failure_probability = failure_probability,
              odds_multiplier = 2,
              limit = cusum_limit,
              max_num_shuffles = 1000,
              quantiles = c(0.,0.05,0.25,0.5,0.75,.95,1))
```


This function returns the signal probability, average CUSUM values and quantiles of the CUSUM distribution specified in the function call. 
```{r}
gcs <- as.data.frame(gcs)
names(gcs) <- c("sig_prob", "avg", "min", "q05", "q25", "median","q75","q95","max")
head(gcs)
```

```{r}
gcs$block_identifier <- input_outcomes[,2]
gcs$t <- seq(1,nrow(gcs))

col1 <- "#f7ba02"
col2 <- "#4063bc"
palette <- rep(c(col1, col2), 300)

ggplot() +
  geom_line(data = gcs, aes(x = t, y = sig_prob)) +
  geom_point(data = gcs, aes(x = t, y = sig_prob, col = as.factor(block_identifier) )) +
  scale_color_manual(guide=FALSE, values = palette) +
  scale_y_continuous(name = "Signal Probability", limits = c(0,1))+
  theme_bw()
```

The complete run can be plotted with:
```{r}
nblock <- max(gcs$block_identifier)

p <- ggplot(gcs)

for ( i in 1: nblock){
  dblock <- gcs[gcs$block_identifier == i,]
  col <- ifelse(i %% 2 == 0,col2,col1)
  dblock_before <- dblock[1,]
  dblock_before$t <- dblock_before$t - .5
  dblock_after <- dblock[nrow(dblock),]
  dblock_after$t <- dblock_after$t + .5
  dblock_n <- rbind(dblock, dblock_before, dblock_after)

  p <- p +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = min, ymax = max), fill = col, alpha = 0.2) +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = q05, ymax = q95), fill = col, alpha = 0.2) +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = q25, ymax = q75), fill = col, alpha = 0.2)

}

p <- p +
  geom_line(data = gcs, aes(x = t, y = median)) +
  geom_point(data = gcs, aes( x = t, y = median, fill = as.factor(block_identifier)), size=2, pch = 21)+
  geom_hline(aes(yintercept = cusum_limit), linetype = 2) +
  theme_bw() +
  scale_y_continuous(name = "CUSUM Distribution") +
  scale_x_continuous(name = "Sequence of Observations") +
  scale_fill_manual(values = palette, guide = FALSE) +
  labs(subtitle = "GSCUSUM")
p
```



## Risk-adjusted GSCUSUM chart

Like in the standard RA-CUSUM chart (see vignette for CUSUM charts), parameters are estimated in order to set up the charts,  

```{r}
n_patients <- nrow(ragscusum_example_data[ragscusum_example_data$year == 2016,])
```

and control limits are set:

```{r}

racusum_limit <- racusum_limit_sim(patient_risks = ragscusum_example_data$score[ragscusum_example_data$year == 2016],
                            odds_multiplier = 2,
                            n_simulation = 1000,
                            alpha = 0.05,
                            seed = 2046)


print(racusum_limit)
```

GSCUSUM charts are constructed on performance data from 2017.
```{r}

ragscusum_data <- ragscusum_example_data[ragscusum_example_data$year == 2017,]

input_outcomes <- matrix(c(gscusum_data$y, gscusum_data$block_identifier), ncol = 2)


gcs <- gscusum(input_outcomes = input_outcomes,
              failure_probability = failure_probability,
              odds_multiplier = 2,
              limit = cusum_limit,
              max_num_shuffles = 1000,
              quantiles = c(0.,0.05,0.25,0.5,0.75,.95,1))
```


This function returns the signal probability, average CUSUM values and quantiles of the CUSUM distribution specified in the function call. 
```{r}
gcs <- as.data.frame(gcs)
names(gcs) <- c("sig_prob", "avg", "min", "q05", "q25", "median","q75","q95","max")
head(gcs)
```

```{r}
gcs$block_identifier <- input_outcomes[,2]
gcs$t <- seq(1,nrow(gcs))

col1 <- "#f7ba02"
col2 <- "#4063bc"
palette <- rep(c(col1, col2), 300)

ggplot() +
  geom_line(data = gcs, aes(x = t, y = sig_prob)) +
  geom_point(data = gcs, aes(x = t, y = sig_prob, col = as.factor(block_identifier) )) +
  scale_color_manual(guide=FALSE, values = palette) +
  scale_y_continuous(name = "Signal Probability", limit = c(0,1))+
  theme_bw()
```

The complete run can be plotted with:
```{r}
nblock <- max(gcs$block_identifier)

p <- ggplot(gcs)

for ( i in 1: nblock){
  dblock <- gcs[gcs$block_identifier == i,]
  col <- ifelse(i %% 2 == 0,col2,col1)
  dblock_before <- dblock[1,]
  dblock_before$t <- dblock_before$t - .5
  dblock_after <- dblock[nrow(dblock),]
  dblock_after$t <- dblock_after$t + .5
  dblock_n <- rbind(dblock, dblock_before, dblock_after)

  p <- p +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = min, ymax = max), fill = col, alpha = 0.2) +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = q05, ymax = q95), fill = col, alpha = 0.2) +
    geom_ribbon(data = dblock_n, aes(x = t, ymin = q25, ymax = q75), fill = col, alpha = 0.2)

}

p <- p +
  geom_line(data = gcs, aes(x = t, y = median)) +
  geom_point(data = gcs, aes( x = t, y = median, fill = as.factor(block_identifier)), size=2, pch = 21)+
  geom_hline(aes(yintercept = cusum_limit), linetype = 2) +
  theme_bw() +
  scale_y_continuous(name = "RACUSUM Distribution") +
  scale_x_continuous(name = "Sequence of Observations") +
  scale_fill_manual(values = palette, guide = FALSE) +
  labs(subtitle = "RA-GSCUSUM")
p
```

