-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeTableWeights.R
163 lines (114 loc) · 4.15 KB
/
makeTableWeights.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
##### As it says, this obtains a table for genotypes' weights-
## Launch as
## nohup R --vanilla --slave -f makeTableWeigths.R &> makeTableWeights.Rout &
rm(list = ls())
source("oncoFunctions.R")
library(codetools)
checkUsageEnv(env = .GlobalEnv)
## library(dplyr)
## library(parallel)
## library(OncoSimulR)
## Change paths as needed
dirsSims <- c("/Disk2/ramon/No-Backuppc/selected-simulations-A",
"/Disk2/ramon/No-Backuppc/selected-simulations-B")
dirsSampled <- "/home/ramon/No-Backuppc/cpm-analyzed-all-selected"
## ## For playing
## dirsSampled <- "~/tmp"
## dirsSims <- "~/tmp"
## filesSim give the "true probabilities" of genotypes
## filesSample4000 are used to obtain the probabilities of genotypes under
## the sampling regime (4000 * 5 = 20000, and that is where we get
## the probabilities from)
filesSim <- list.files(dirsSims, pattern="simobj", full.names = TRUE)
filesSampled <- list.files(dirsSampled,
pattern = glob2rx("ANALYZED__ID*.rds"),
full.names = TRUE)
filesSampled4000 <- grep("_size_split_4000\\.rds", filesSampled,
value = TRUE)
stopifnot(length(filesSim) == 1260)
stopifnot(length(filesSampled4000) == (1260 * 3))
date()
## Warm up compiler and check
null <- dplyr::bind_rows(
lapply(filesSampled4000[1:5],
freqs_from_sampling_full))
null
null <- dplyr::bind_rows(
lapply(filesSim[1:5],
true_Freqs_full_2))
null
rm(null)
gc()
#### Sampled is much faster
## Do for real
pboptions(type = "txt")
cat("\n Doing sampled \n")
outFreqsSampled <- pbmclapply(filesSampled4000,
freqs_from_sampling_full,
mc.cores = detectCores())
save(file = "outFreqsSampled.RData", outFreqsSampled,
compress = FALSE)
date()
outFreqsSampled <- dplyr::bind_rows(outFreqsSampled)
save(file = "outFreqsSampled.RData", outFreqsSampled,
compress = FALSE)
date()
#### Simulations
date()
## Do for real
cat("\n Doing simulations \n")
## This is very slow. I think it is I/O bound (load average relative to
## %CPU, R processes waiting, etc, and files leave in an HDD, not the
## SSD). If rerunning, it could make sense to run this with lapply
## but parallelize inside true_Freqs. Place also timing sentinels
## in true_Freq_nulls.
## I tried it, with little improvement with the _2 versions
## in limited testing.
outFreqsTrue <- pbmclapply(filesSim,
true_Freqs_full_2,
mc.cores = 10) ## detectCores())
save(file = "outFreqsTrue.RData", outFreqsTrue,
compress = FALSE)
## I was getting out of RAM errors
any_error <- unlist(lapply(outFreqsTrue, function(x) inherits(x, "try-error")))
if(sum(any_error)) {
print(outFreqsTrue[any_error])
sum(any_error)
}
date()
outFreqsTrue <- dplyr::bind_rows(outFreqsTrue)
save(file = "outFreqsTrue.RData", outFreqsTrue,
compress = FALSE)
gc()
date()
## BEWARE here!!
## Had we done this
## weightsAll <- dplyr::full_join(outFreqsSampled,
## outFreqsTrue)
## in weightsAll, some genotypes, that appear in the "true"
## never appeared in one or more of the three detect. But they
## could been in a POM. Fix that.
## How can we fix that? Easy. Create a dummies and replicate.
## The True is the same in all sampling regimes
## and we ensure those genotypes always present as they should
outFreqsTrueL <- outFreqsTrueS <- outFreqsTrueU <- outFreqsTrue
outFreqsTrueL$detect <- "large"
outFreqsTrueS$detect <- "small"
outFreqsTrueU$detect <- "unif"
outFreqsTrue3 <- rbind(outFreqsTrueL,
outFreqsTrueS,
outFreqsTrueU)
nrow(outFreqsSampled)
nrow(outFreqsTrue)
nrow(outFreqsTrue3)
weightsAll <- dplyr::full_join(outFreqsSampled,
outFreqsTrue3)
nrow(weightsAll)
## Remember: NAs mean some genotypes not present
## We expect no missing values in TrueFreq and TrueProp
## except for the genotypes with the splits. But none here
## So no missing there
summary(weightsAll)
save(file = "weightsAll.RData", weightsAll,
compress = FALSE)
date()