-
Notifications
You must be signed in to change notification settings - Fork 0
/
data-retrieval.R
60 lines (49 loc) · 2.22 KB
/
data-retrieval.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
library("curatedMetagenomicData")
library("dplyr")
# ====== Retrieve metadata & counts ======
# Exploratory look at every sample whose study_condition is "CRC": which
# study it belongs to and which sequencing platform was used. The result is
# not stored anywhere.
crc_meta <- sampleMetadata[sampleMetadata$study_condition %in% "CRC", ]
crc_meta |>
  select(study_name, sequencing_platform)

# Studies kept for the downstream analysis.
crc_studies <- c("GuptaA_2019", "ThomasAM_2018b")

# Filter studies and return a TreeSummarizedExperiment.
# Disabled filters, kept for reference:
#   filter(disease == 'healthy')
#   filter(pregnant != 'yes')
#   filter(lactating != 'yes')
tse <- sampleMetadata |>
  filter(
    study_name %in% crc_studies,
    age_category == "adult",
    body_site == "stool"
  ) |>
  returnSamples("relative_abundance", rownames = "NCBI")

# Obtain the data as data frames: the first assay of `tse` and its
# per-sample column data.
counts <- assays(tse)[[1]] |>
  as.data.frame()
meta <- colData(tse) |>
  data.frame()
# ====== Balance the dataset ======
# Omitted the step to retrieve all the data for two studies.
# The down-sampling code below is disabled (commented out); it is kept as a
# recipe for drawing the same number of samples from every study.

# Find the counts per study name
study_counts <- table(meta$study_name)
# Determine the minimum count of samples across all study names
## either select the minimum of study counts
#min_count <- min(study_counts)
## or a constant
#min_count <- 10
# Randomly sample that number of samples from each study name
#sampled_data <- lapply(names(study_counts), function(study) {
# subset(meta, study_name == study)[sample(1:nrow(subset(meta, study_name == study)), min_count), ]
#})
# Combine sampled datasets into a single balanced dataset
#balanced_meta <- do.call(rbind, sampled_data)
#table(balanced_meta$study_name)
# filter the counts
#counts <- counts[, rownames(balanced_meta)]
# to check the number of unique values in each column
#sapply(meta, function(x) length(unique(x)))

# Export the sample metadata and the abundance table.
# write.csv() cannot create missing directories, so make sure "data/" exists
# first; dir.create() is a no-op (warning suppressed) when it already does.
dir.create("data", showWarnings = FALSE, recursive = TRUE)
write.csv(meta, "data/otutable_metadata.csv")
write.csv(counts, "data/otutable.csv")
# ====== Table 2: Studies ======
# One row per study: the first PMID seen, the distinct sequencing platforms
# and DNA extraction kits (comma-separated), and the number of samples whose
# `disease` is exactly "healthy" or exactly "CRC".
table2 <- meta |>
  group_by("Назва дослідження" = study_name) |>
  summarise(
    "PMID" = first(PMID),
    "Платформа секвенування" = toString(unique(sequencing_platform)),
    "Набор для екстракції ДНК" = toString(unique(DNA_extraction_kit)),
    # na.rm = TRUE: a single missing `disease` value would otherwise turn
    # the whole per-study count into NA.
    "Здорових зразків" = sum(disease == "healthy", na.rm = TRUE),
    "Ракових зразків" = sum(disease == "CRC", na.rm = TRUE)
  )

# write.csv() cannot create missing directories, so make sure "docs/" exists.
dir.create("docs", showWarnings = FALSE, recursive = TRUE)
# The header contains Cyrillic text; fix the file encoding so the output does
# not depend on the platform's native encoding.
write.csv(table2, "docs/table2.csv", fileEncoding = "UTF-8")