sharing_test.Rmd

---
title: "Untitled"
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(ggplot2)
library(forcats)
library(broom)
library(ggbeeswarm)
library(data.table)
library(reshape2)
library(ggridges)
#!!! How to install brms, first we install 'V8' then the package itself
#Sys.setenv(DOWNLOAD_STATIC_LIBV8 = 1)
#install.packages(c("V8", "rstan", "brms"))
library(brms)
library(rstan)

# Ask rstan to write compiled Stan models to disk so an unchanged model need not
# be recompiled (behaviour as described in the rstan docs for `auto_write`).
rstan_options(auto_write = TRUE)
# Set the `mc.cores` option to the number of cores detected on this machine.
options(mc.cores = parallel::detectCores())

# Return a lower-case operating system label.
#
# Uses Sys.info()["sysname"] when Sys.info() is available, mapping "Darwin" to
# "osx"; otherwise falls back to .Platform$OS.type, overridden by "osx" or
# "linux" when R.version$os matches the corresponding pattern.
get_os <- function() {
  info <- Sys.info()
  if (is.null(info)) {
    # Sys.info() returned NULL: rely on the platform strings R was built with
    r_os <- R.version$os
    name <- .Platform$OS.type
    if (grepl("^darwin", r_os)) {
      name <- "osx"
    }
    if (grepl("linux-gnu", r_os)) {
      name <- "linux"
    }
  } else {
    name <- info["sysname"]
    if (name == "Darwin") {
      name <- "osx"
    }
  }
  tolower(name)
}
# Read a gzip-compressed table by streaming it through the platform's
# decompressor ("gzcat" on macOS, "zcat" elsewhere) into data.table::fread().
#
# x: path to a .gz file.
# Returns whatever fread() returns for the decompressed stream.
#
# The command is passed through `cmd =` explicitly instead of relying on fread
# guessing that its input is a shell command, and the path is shell-quoted so
# file names containing spaces or shell metacharacters are handled correctly.
read_gz <- function(x) {
  gz_cat <- if (get_os() == "osx") "gzcat" else "zcat"
  fread(cmd = paste(gz_cat, shQuote(x)))
}

# Bind `rename` and `select` in the global environment to the dplyr versions so
# that same-named functions from other attached packages are not picked up instead.
rename <- dplyr::rename
select <- dplyr::select
```

## Load data

Load datasets and mark DLI patients. Check the number of clonotypes in donors and recipients - we have enough clones for statistics everywhere. We group clones by their abundance in donors: 1 (singletons), 2 (doubletons), 3 (tripletons, as good as doubletons, no need to hate them Vanya) and 4+ reads (Large). The choice is dictated by the observation that, for rare events, the Poisson distribution shows a huge difference in capture probability for $\lambda \in [1,3]$, while smaller $\lambda$ values are unlikely to be encountered and quantified. Moreover, for large clones, each hyperexpanded variant has its own history and, likely, its own dynamics, so assigning them to different bins based on minor differences in frequency (e.g. $0.1\%$ vs $0.01\%$) makes little sense.

```{r}
# Read every file under data/ with read_gz(), tag each row with its file path,
# and stack the tables. Then derive:
#   sample.id.old - the original file path
#   dli           - TRUE when the path does NOT start with "data/sh.p"
#   sample.id     - short label "D<n>" (n = factor index of the path), with a
#                   trailing "*" for DLI samples
data <- list.files("data", full.names = TRUE) %>%
  lapply(function(path) read_gz(path) %>% mutate(sample.id = path)) %>%
  rbindlist %>%
  mutate(sample.id.old = sample.id,
         dli = !str_starts(sample.id.old, fixed("data/sh.p")),
         sample.id = paste0("D", sample.id %>% as.factor %>% as.integer, ifelse(dli, "*", "")))
# To restrict the analysis to DLI samples only, append: %>% filter(dli)

# Print the mapping from original file path to DLI flag and short sample label
unique(select(data, sample.id.old, dli, sample.id))

# Bin every clonotype by its read count in the donor sample. Rows without a
# donor count are labelled "Missing"; counts of 4 or more fall through to
# "Large". The catch-all uses TRUE rather than the reassignable shorthand T.
data <- data %>%
  mutate(donor.quantile = case_when(
    is.na(cloneCount.don) ~ "Missing",
    cloneCount.don == 1 ~ "Singleton",
    cloneCount.don == 2 ~ "Doubleton",
    cloneCount.don == 3 ~ "Tripleton",
    TRUE ~ "Large"
  ))

# Stacked bar chart: number of rows per sample, filled by donor size class.
# Missing donor counts are treated as 0 so that the fill levels are ordered by
# donor read count.
data %>%
  mutate(
    cloneCount.don = ifelse(is.na(cloneCount.don), 0, cloneCount.don),
    size.class = fct_reorder(donor.quantile, cloneCount.don)
  ) %>%
  ggplot(aes(x = sample.id, fill = size.class)) +
  geom_bar() +
  scale_fill_brewer("Size", palette = "Spectral") +
  theme_classic()

# Dodged bar chart (log10 y axis): number of donor clonotypes per size class and
# sample, with reference lines at 100 (dotted) and 1000 (dashed) clonotypes.
ggplot(
  filter(data, donor.quantile != "Missing"),
  aes(x = donor.quantile, fill = sample.id)
) +
  geom_bar(position = "dodge") +
  scale_y_log10() +
  geom_hline(yintercept = 100, linetype = "dotted") +
  geom_hline(yintercept = 1000, linetype = "dashed") +
  theme_classic() +
  theme(legend.position = "bottom")
```

## Modeling probability of "survival" for clones using Beta distribution

We split our donor dataset into singletons, doubletons, tripletons and higher-order clonotypes. Each of these subsets contains enough clones to reliably estimate the probability of recapturing a clonotype from a given subset of donor clonotypes. Interestingly, the ratio between the recapture probabilities of singletons, doubletons and tripletons is in line with the exponential difference stemming from the Poisson distribution.

```{r}
# Summarise per sample and donor size class, and estimate the parameters of a
# Beta distribution for the recapture probability.
# alpha.prior = beta.prior = 1 are added to the counts below as pseudo-counts.
alpha.prior <- 1
beta.prior <- 1
data.s <- data %>%
  filter(donor.quantile != "Missing") %>%
  group_by(sample.id) %>%
  # Per-sample totals. Distinct CDR3 sequences are counted with NA excluded:
  # length(unique(x)) would count NA as one extra clonotype.
  # NOTE(review): "Missing" rows were filtered out above, so the recipient
  # totals only cover rows that also have a donor count - confirm intended.
  mutate(total.don = sum(cloneCount.don, na.rm = TRUE),
         clones.don = n_distinct(aaSeqCDR3.don, na.rm = TRUE),
         total.rec = sum(cloneCount.rec, na.rm = TRUE),
         clones.rec = n_distinct(aaSeqCDR3.rec, na.rm = TRUE)) %>%
  group_by(sample.id, donor.quantile) %>%
  # Number of distinct donor clonotypes within each size class of a sample
  mutate(clones.don.quant = n_distinct(aaSeqCDR3.don, na.rm = TRUE)) %>%
  group_by(dli, sample.id, donor.quantile, total.don, clones.don, total.rec, clones.rec, clones.don.quant) %>%
  # alpha: rows with a recipient count (+ prior); beta: rows without (+ prior)
  summarize(alpha = sum(!is.na(cloneCount.rec)) + alpha.prior,
            beta = sum(is.na(cloneCount.rec)) + beta.prior) %>%
  ungroup

# Evaluate the Beta density on a grid of probabilities for every row of data.s.
# The grid combines a linear part (0 to 1 in steps of 1/1000) with a
# log-spaced part (10^-4 to 10^-1); merging two tables that share no columns
# yields every (row, p) combination.
p.grid <- tibble(p = c(0:1000/1000, 10^(-4000:-1000/1000)))
data.sp <- merge(data.s, p.grid) %>%
  group_by(sample.id, donor.quantile) %>%
  mutate(Pbeta = dbeta(p, alpha, beta)) %>%
  ungroup

# Ridgeline plot of the Beta density (Pbeta) of the capture probability, one
# ridge per sample and donor size class, faceted by DLI status.
data.sp %>%
  mutate(dli = ifelse(dli, "DLI", "non-DLI")) %>%
  group_by(sample.id) %>%
  # Rescale so the tallest density within each sample has height 1
  mutate(height = Pbeta / max(Pbeta)) %>%
  ggplot(aes(x = p, y = sample.id, height = height,
             fill = factor(donor.quantile, levels = c("Singleton",
                                                      "Doubleton",
                                                      "Tripleton",
                                                      "Large"))
  )) +
  geom_ridgeline(color = NA, alpha = 0.9) +
  scale_x_log10("Capture probability", limits = c(0.8e-4, 1e-1)) + ylab("Pbeta") +
  scale_fill_brewer("", palette = "Spectral") +
  facet_wrap(~dli, scales = "free_y") +
  theme_classic() +
  # Spell out `aspect.ratio`: the abbreviated `aspect` depends on partial
  # argument matching, which is fragile across ggplot2 versions.
  theme(aspect.ratio = 1, legend.position = "bottom")
```

Interestingly, the TCR recovery rate is related both to the total number of clones in donor and recipient. It is also different for DLI and non-DLI patients.

```{r fig.width=8, fig.height=8}
# Scatter plot (log-log) of the mean capture probability alpha / (alpha + beta)
# against the recipient-to-donor clonotype ratio, with one linear trend per
# combination of DLI status and donor size class. Points carry the sample label.
data.s %>%
  mutate(dli = ifelse(dli, "DLI", "non-DLI")) %>%
  ggplot(aes(x = clones.rec / clones.don, y = alpha / (alpha + beta),
             group = paste(dli, donor.quantile),
             linetype = dli, shape = dli,
             color = factor(donor.quantile, levels = c("Singleton",
                                                       "Doubleton",
                                                       "Tripleton",
                                                       "Large")))) +
  geom_smooth(method = "lm", size = 1) +
  geom_point(size = 5) +
  geom_text(aes(label = sample.id), size = 2, color = "white") +
  scale_x_log10("Clones in receptient / clones in donor") +
  scale_y_log10("Capture probability") +
  scale_color_brewer("", palette = "Spectral") +
  theme_classic() +
  # Spell out `aspect.ratio`: the abbreviated `aspect` depends on partial
  # argument matching, which is fragile across ggplot2 versions.
  theme(aspect.ratio = 1, legend.position = "bottom")
```

## Basic linear modelling

Quantifying the effect of various factors -- number of clones detected in donor, number of clones detected in recipient and the frequency quantile of a given clonotype in donor -- on the recapture probability. Log-transformed variables show extremely high correlation.

```{r}
# Modelling table: log of the mean recapture probability and log clonotype
# counts, with donor.quantile as a factor ordered from smallest to largest
# size class. All derived columns are element-wise, so no grouping is needed.
data.coord <- data.s %>%
  mutate(
    logRecaptureProb = log(alpha / (alpha + beta)),
    logClonesRecepient = log(clones.rec),
    logClonesDonor = log(clones.don),
    donor.quantile = factor(
      donor.quantile,
      levels = c("Singleton", "Doubleton", "Tripleton", "Large")
    )
  )
```

Show coefficients of linear model

```{r}
# Fit the linear model directly with `data =` instead of do() and `.$column`
# references, so the coefficient table carries clean term names
# (e.g. "logClonesDonor" rather than ".$logClonesDonor").
# donor.quantile is already a factor in data.coord.
lm(logRecaptureProb ~ donor.quantile + dli + logClonesRecepient + logClonesDonor,
   data = data.coord) %>%
  tidy
```

Show variance explained (ANOVA)

```{r}
# ANOVA table for the same linear model, fitted directly with `data =` so the
# terms carry clean names, plus the share of the total sum of squares
# attributed to each term (residuals included in the total).
aov(logRecaptureProb ~ donor.quantile + dli + logClonesRecepient + logClonesDonor,
    data = data.coord) %>%
  tidy %>%
  mutate(var.explained.pct = sumsq / sum(sumsq) * 100)
```

Origin of clones found in recipient: the number of highly expanded clones that originated from expanded donor clones and from rare donor clones varies and depends on the donor. In general clonotypes preserve their size, but there is a lot of noise here.

> Open question - show this statistically, that survival prob depends not just on sampling, but is more skewed and depends on clonotype size.

```{r fig.width=8, fig.height=8}
# Plot the rank of recipient clonotypes (by recipient read count, within each
# sample) against the size class the clonotype had in the donor.
#
# df:      clonotype table with columns dli, sample.id, cloneCount.rec and
#          donor.quantile.
# use.dli: TRUE to plot DLI samples, FALSE to plot non-DLI samples.
# title:   plot title.
# Returns a ggplot object.
plot_rank_by_donor_quantile <- function(df, use.dli, title) {
  df %>%
    # "Missing" is not among the levels, so those rows become NA here
    mutate(donor.quantile = factor(donor.quantile, levels = c("Singleton",
                                                              "Doubleton",
                                                              "Tripleton",
                                                              "Large"))) %>%
    filter(dli == use.dli & !is.na(cloneCount.rec)) %>%
    group_by(sample.id) %>%
    # Rank and frequency are computed over ALL recipient clonotypes of the
    # sample, including those absent from the donor ...
    mutate(rank = rank(-cloneCount.rec, ties.method = "first"),
           freq.rec = cloneCount.rec / sum(cloneCount.rec)) %>%
    # ... which are dropped only afterwards (they are NA after the factor step)
    filter(!is.na(donor.quantile)) %>%
    ggplot(aes(x = donor.quantile, y = rank)) +
    geom_hline(yintercept = 100, linetype = "dashed") +
    geom_quasirandom(aes(size = freq.rec, color = donor.quantile)) +
    geom_boxplot(fill = NA, color = "black", outlier.colour = NA) +
    coord_flip() +
    scale_y_log10("Clonotype rank") +
    xlab("") +
    scale_size_continuous("Clonotype size") +
    scale_color_brewer("", palette = "Spectral") +
    facet_wrap(.~sample.id, scales = "free_x") +
    theme_classic() +
    # Spell out `aspect.ratio`: the abbreviated `aspect` depends on partial
    # argument matching, which is fragile across ggplot2 versions.
    theme(aspect.ratio = 1, legend.position = "bottom") +
    ggtitle(title)
}

plot_rank_by_donor_quantile(data, use.dli = TRUE, title = "DLI")

plot_rank_by_donor_quantile(data, use.dli = FALSE, title = "Non-DLI")
```

## Modeling data and covariate analysis examples

Here is an example of how we can correct for sampling probability based on sample diversities and clonotype size and compare between DLI and non-DLI donors. Here we compute a single $\Delta p = p_{observed} - p_{predicted}$ for every sample, i.e. if the donor sample contains (as fractions) $\phi_1$ singletons, $\phi_2$ doubletons, etc., and the difference for size class $i$ is $\Delta p_i$, we compute a weighted sum $\Delta p = \sum_i \phi_i \Delta p_i$.

Actually works not that great, however, if we treat $\Delta p_{i}$ separately for each sample we are artificially boosting the number of "samples" for statistical testing. An alternative would be to look separately at singletons, doubletons, etc (I think multiple testing can be omitted here as we do like 2-3 tests at most).

This is just an example of what can be done "manually" without using proper statistical methods like building several models, estimating P-values for various covariates and comparing models using ANOVA, etc.

```{r fig.width=3, fig.height=4}
# Fit one linear model per cohort (DLI / non-DLI) and store the in-sample
# prediction of the log recapture probability next to the observed value.
data.dli.pred <- filter(data.coord, dli)
lm.dli <- lm(logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor,
             data = data.dli.pred)
data.dli.pred$logRecaptureProbPred <- predict(lm.dli, data.dli.pred)

data.ndli.pred <- filter(data.coord, !dli)
lm.ndli <- lm(logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor,
              data = data.ndli.pred)
data.ndli.pred$logRecaptureProbPred <- predict(lm.ndli, data.ndli.pred)

# Per sample: observed minus predicted (delta), summarised three ways -
# weighted by the share of donor clonotypes in each size class, as a plain
# mean, and for singletons only.
data.delta.summ <- rbind(data.dli.pred,
                         data.ndli.pred) %>%
  group_by(sample.id) %>%
  mutate(clones.don.quant.frac = clones.don.quant / sum(clones.don.quant),
         delta = logRecaptureProb - logRecaptureProbPred) %>%
  group_by(sample.id, dli) %>%
  summarise(diff = sum(delta * clones.don.quant.frac),
            diff.unweighted = mean(delta),
            diff.singl = sum(delta * (donor.quantile == "Singleton")))

# Per-sample deltas with a readable DLI label for the x axis
data.delta.labelled <- data.delta.summ %>%
  mutate(dli = ifelse(dli, "DLI", "non-DLI"))

# Layers shared by the three DLI vs non-DLI comparison plots below
delta.layers <- list(
  geom_quasirandom(width = 0.1),
  geom_boxplot(width = 0.2, color = "black", fill = NA, outlier.colour = NA),
  # guide = "none" hides the colour legend (replaces the deprecated guide = F)
  scale_color_brewer(guide = "none", palette = "Set1"),
  xlab(""),
  theme_classic()
)

ggplot(data.delta.labelled, aes(x = dli, y = diff, color = dli)) +
  delta.layers +
  ylab("Cumulative difference in recapture prob.") +
  ggtitle("Weighted by fraction of singletons, doubletons, etc")

ggplot(data.delta.labelled, aes(x = dli, y = diff.unweighted, color = dli)) +
  delta.layers +
  ylab("Cumulative difference in recapture prob.") +
  ggtitle("Not weighted by fraction of singletons, doubletons, etc")

ggplot(data.delta.labelled, aes(x = dli, y = diff.singl, color = dli)) +
  delta.layers +
  ylab("Difference in recapture prob.") +
  ggtitle("Singletons only")
```

Now let's build some [linear models using `lm`](https://ucdavis-bioinformatics-training.github.io/2019-March-Bioinformatics-Prerequisites/thursday/linear_models.html) and compare them using ANOVA

```{r}
# Baseline model: size class plus log clonotype counts, no DLI term
mdl.1 <- lm(
  formula = logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor,
  data = data.coord
)
mdl.1

# Same model with DLI status added as a covariate
mdl.2 <- lm(
  formula = logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor + dli,
  data = data.coord
)
mdl.2
```

Summary of the model without the DLI covariate (`mdl.1`)

```{r}
# mdl.1 is the fit WITHOUT the dli term; plot() on an lm fit draws its
# diagnostic plots.
summary(mdl.1)
plot(mdl.1)
```

And the model with the DLI covariate (`mdl.2`)

```{r}
# mdl.2 is the fit WITH the dli term; plot() on an lm fit draws its
# diagnostic plots.
summary(mdl.2)
plot(mdl.2)
```

Compare two models with ANOVA, the second one has smaller residual sum of squares

```{r}
# Compare the nested fits: mdl.2 is mdl.1 plus the dli term
anova(mdl.1, mdl.2)
```

Note that we can also do the modelling using generalized linear models, [GLMs](https://www.guru99.com/r-generalized-linear-model.html). Here we drop the log-transform as we use the binomial distribution

```{r}
# Clonotype-level logistic regression: `found` is 1 when the clonotype has a
# recipient count and 0 otherwise. Rows without a donor count ("Missing") are
# excluded, and the per-sample covariates clones.don / clones.rec are attached
# by merging with the distinct sample rows of data.coord (merge joins on the
# columns the two tables share).
# NOTE(review): clones.don and clones.rec enter untransformed here, unlike the
# log-transformed predictors of the lm fits - confirm this is intended.
mdl.glm <- glm(found ~ donor.quantile + clones.don + clones.rec + dli,
               family = "binomial",
               data = data %>%
                 filter(donor.quantile != "Missing") %>%
                 mutate(found = ifelse(is.na(cloneCount.rec), 0, 1)) %>%
                 merge(data.coord %>% 
                         select(sample.id, clones.don, clones.rec, dli) %>%
                         unique))

summary(mdl.glm)
```

Bayes modelling

```{r}
# Fit and print a Bayesian regression with brms (no DLI term).
# NOTE(review): the result is not assigned, and the same formula/data is fitted
# again as mdl.b.1 in the next chunk - this fit appears redundant; consider
# printing mdl.b.1 instead to avoid running the sampler twice.
brm(logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor,
               data = data.coord)
```

Bayesian modelling (can also use smth like http://mjskay.github.io/tidybayes/articles/tidy-brms.html). Note that no complex models here (e.g. with dependence on parameters)

```{r}
# Bayesian counterparts of the two linear models: mdl.b.1 without and mdl.b.2
# with the DLI covariate. A fixed seed is passed so that the MCMC draws, and
# therefore the summaries and the model comparison, are reproducible between
# renders of this document.
mdl.b.1 <- brm(logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor,
               data = data.coord,
               seed = 1)

mdl.b.2 <- brm(logRecaptureProb ~ donor.quantile + logClonesRecepient + logClonesDonor + dli,
               data = data.coord,
               seed = 1)
```

Plot models

```{r}
# Posterior summaries of both Bayesian fits
summary(mdl.b.1)
summary(mdl.b.2)

# Plots of both fits; ask = FALSE (spelled out, since F can be reassigned)
# so rendering does not pause between pages
plot(mdl.b.1, ask = FALSE)
plot(mdl.b.2, ask = FALSE)

# Leave-one-out comparison of the fits without and with the dli term
LOO(mdl.b.1, mdl.b.2)
```