From 614b8f931929e571958c87db6c0bd2738a86ac13 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 18 Jul 2024 13:04:38 +0530 Subject: [PATCH 01/14] Shift functions to check for missing files Return from convert_input via a helper function Update corresponding test files and add tests to ensure do_conversions isn't affected by current applied changes Signed-off-by: Abhinav Pandey --- base/db/R/add.database.entries.R | 111 +++++++++++++ base/db/R/check.missing.files.R | 49 ++++++ base/db/R/convert_input.R | 150 ++---------------- base/db/man/add.database.entries.Rd | 70 ++++++++ base/db/man/check_missing_files.Rd | 31 ++++ .../tests/testthat/test.check.missing.files.R | 24 +++ base/db/tests/testthat/test.convert_input.R | 29 ++-- 7 files changed, 320 insertions(+), 144 deletions(-) create mode 100644 base/db/R/add.database.entries.R create mode 100644 base/db/R/check.missing.files.R create mode 100644 base/db/man/add.database.entries.Rd create mode 100644 base/db/man/check_missing_files.Rd create mode 100644 base/db/tests/testthat/test.check.missing.files.R diff --git a/base/db/R/add.database.entries.R b/base/db/R/add.database.entries.R new file mode 100644 index 00000000000..3c253c07e73 --- /dev/null +++ b/base/db/R/add.database.entries.R @@ -0,0 +1,111 @@ +#' Return new arrangement of database while adding code to deal with ensembles +#' +#' @param result list of results from the download function +#' @param con database connection +#' @param start_date start date of the data +#' @param end_date end date of the data +#' @param write whether to write to the database +#' @param overwrite Logical: If a file already exists, create a fresh copy? +#' @param insert.new.file whether to insert a new file +#' @param input.args input arguments obtained from the convert_input function +#' @param machine machine information +#' @param mimetype data product specific file format +#' @param formatname format name of the data +#' @param allow.conflicting.dates whether to allow conflicting dates +#' @param ensemble ensemble id +#' @param ensemble_name ensemble name +#' @param existing.input existing input records +#' @param existing.dbfile existing dbfile records +#' @param input input records +#' @return list of input and dbfile ids +#' +#' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko + +add.database.entries <- function( + result, con, start_date, + end_date, write, overwrite, + insert.new.file, input.args, + machine, mimetype, formatname, + allow.conflicting.dates, ensemble, + ensemble_name, existing.input, + existing.dbfile, input) { + if (write) { + # Setup newinput. This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result. + # This list will be returned. + newinput <- list(input.id = NULL, dbfile.id = NULL) # Blank vectors are null. + for (i in 1:length(result)) { # Master for loop + id_not_added <- TRUE + + if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0 && + (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) { + # Updating record with new dates + db.query(paste0("UPDATE inputs SET start_date='", start_date, "', end_date='", end_date, "' WHERE id=", existing.input[[i]]$id), con) + id_not_added <- FALSE + + # The overall structure of this loop has been set up so that exactly one input.id and one dbfile.id will be written to newinput every iteration. + newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) + newinput$dbfile.id <- c(newinput$dbfile.id, existing.dbfile[[i]]$id) + } + + if (overwrite) { + # A bit hacky, but need to make sure that all fields are updated to expected values (i.e., what they'd be if convert_input was creating a new record) + if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0) { + db.query(paste0("UPDATE inputs SET name='", basename(dirname(result[[i]]$file[1])), "' WHERE id=", existing.input[[i]]$id), con) + } + + if (!is.null(existing.dbfile) && nrow(existing.dbfile[[i]]) > 0) { + db.query(paste0("UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), "', file_name='", result[[i]]$dbfile.name[1], "' WHERE id=", existing.dbfile[[i]]$id), con) + } + } + + # If there is no ensemble then for each record there should be one parent + # But when you have ensembles, all of the members have one parent !! + parent.id <- if (is.numeric(ensemble)) { + ifelse(is.null(input[[i]]), NA, input[[1]]$id) + } else { + ifelse(is.null(input[[i]]), NA, input[[i]]$id) + } + + + if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) { + site.id <- input.args$newsite + } + + if (insert.new.file && id_not_added) { + dbfile.id <- dbfile.insert(in.path = dirname(result[[i]]$file[1]), in.prefix = result[[i]]$dbfile.name[1], "Input", existing.input[[i]]$id, con, reuse = TRUE, hostname = machine$hostname) + newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) + newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id) + } else if (id_not_added) { + # This is to tell input.insert if we are writing ensembles + # Why does it need it? Because it checks for inputs with the same time period, site, and machine + # and if it returns something it does not insert anymore, but for ensembles, it needs to bypass this condition + ens.flag <- if (!is.null(ensemble) | is.null(ensemble_name)) TRUE else FALSE + + new_entry <- dbfile.input.insert( + in.path = dirname(result[[i]]$file[1]), + in.prefix = result[[i]]$dbfile.name[1], + siteid = site.id, + startdate = start_date, + enddate = end_date, + mimetype = mimetype, + formatname = formatname, + parentid = parent.id, + con = con, + hostname = machine$hostname, + allow.conflicting.dates = allow.conflicting.dates, + ens = ens.flag + ) + + newinput$input.id <- c(newinput$input.id, new_entry$input.id) + newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id) + } + } # End for loop + + successful <- TRUE + return(newinput) + } else { + PEcAn.logger::logger.warn("Input was not added to the database") + successful <- TRUE + return(NULL) + } +} diff --git a/base/db/R/check.missing.files.R b/base/db/R/check.missing.files.R new file mode 100644 index 00000000000..bde3d7ebe97 --- /dev/null +++ b/base/db/R/check.missing.files.R @@ -0,0 +1,49 @@ +#' Function to check if result has empty or missing files +#' +#' @param result A list of dataframes with file paths +#' @param outname Name of the output file +#' @param existing.input Existing input records +#' @param existing.dbfile Existing dbfile records +#' @return A list of dataframes with file paths, a list of strings with the output file name, a list of existing input records, and a list of existing dbfile records +#' +#' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko + +check_missing_files <- function(result, outname, existing.input = NULL, existing.dbfile = NULL) { + result_sizes <- purrr::map_dfr( + result, + ~ dplyr::mutate( + ., + file_size = purrr::map_dbl(file, file.size), + missing = is.na(file_size), + empty = file_size == 0 + ) + ) + + if (any(result_sizes$missing) || any(result_sizes$empty)) { + log_format_df <- function(df) { + formatted_df <- rbind(colnames(df), format(df)) + formatted_text <- purrr::reduce(formatted_df, paste, sep = " ") + paste(formatted_text, collapse = "\n") + } + + PEcAn.logger::logger.severe( + "Requested Processing produced empty files or Nonexistent files:\n", + log_format_df(result_sizes[, c(1, 8, 9, 10)]), + "\n Table of results printed above.", + wrap = FALSE + ) + } + + # Insert into Database + outlist <- unlist(strsplit(outname, "_")) + + # Wrap in a list for consistant processing later + if (exists("existing.input") && is.data.frame(existing.input)) { + existing.input <- list(existing.input) + } + + if (exists("existing.dbfile") && is.data.frame(existing.dbfile)) { + existing.dbfile <- list(existing.dbfile) + } + return(list(result_sizes, outlist, existing.input, existing.dbfile)) +} diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index 8203fa7244b..d5af069d0eb 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -384,7 +384,7 @@ convert_input <- if (!is.null(ensemble) && ensemble) { return.all <-TRUE - }else{ + } else{ return.all <- FALSE } existing.dbfile <- dbfile.input.check(siteid = site.id, @@ -734,143 +734,23 @@ convert_input <- #--------------------------------------------------------------------------------------------------# # Check if result has empty or missing files - result_sizes <- purrr::map_dfr( - result, - ~ dplyr::mutate( - ., - file_size = purrr::map_dbl(file, file.size), - missing = is.na(file_size), - empty = file_size == 0 - ) - ) - - if (any(result_sizes$missing) || any(result_sizes$empty)){ - log_format_df = function(df){ - rbind(colnames(df), format(df)) - purrr::reduce( paste, sep=" ") %>% - paste(collapse="\n") - } - - PEcAn.logger::logger.severe( - "Requested Processing produced empty files or Nonexistant files :\n", - log_format_df(result_sizes[,c(1,8,9,10)]), - "\n Table of results printed above.", - wrap = FALSE) - } - - # Insert into Database - outlist <- unlist(strsplit(outname, "_")) - - # Wrap in a list for consistant processing later - if (exists("existing.input") && is.data.frame(existing.input)) { - existing.input <- list(existing.input) - } - - if (exists("existing.dbfile") && is.data.frame(existing.dbfile)) { - existing.dbfile <- list(existing.dbfile) - } + checked.missing.files <- check_missing_files(result, outname, existing.input, existing.dbfile) + + # Unwrap parameters after performing checks for missing files + result_sizes <- checked.missing.files$result_sizes; + outlist <- checked.missing.files$outlist; + existing.input <- checked.missing.files$existing.input; + existing.dbfile <- checked.missing.files$existing.dbfile; #---------------------------------------------------------------# # New arrangement of database adding code to deal with ensembles. - if (write) { - - # Setup newinput. This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result. - # This list will be returned. - newinput = list(input.id = NULL, dbfile.id = NULL) #Blank vectors are null. - for(i in 1:length(result)) { # Master for loop - id_not_added <- TRUE - - if (exists("existing.input") && nrow(existing.input[[i]]) > 0 && - (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) { - - # Updating record with new dates - db.query(paste0("UPDATE inputs SET start_date='", start_date, "', end_date='", - end_date, "' WHERE id=", existing.input[[i]]$id), - con) - id_not_added = FALSE - - # The overall structure of this loop has been set up so that exactly one input.id and one dbfile.id will be written to newinput every interation. - newinput$input.id = c(newinput$input.id, existing.input[[i]]$id) - newinput$dbfile.id = c(newinput$dbfile.id, existing.dbfile[[i]]$id) - } - - if (overwrite) { - # A bit hacky, but need to make sure that all fields are updated to expected - # values (i.e., what they'd be if convert_input was creating a new record) - if (exists("existing.input") && nrow(existing.input[[i]]) > 0) { - db.query(paste0("UPDATE inputs SET name='", basename(dirname(result[[i]]$file[1])), - "' WHERE id=", existing.input[[i]]$id), con) - - } - - if (exists("existing.dbfile") && nrow(existing.dbfile[[i]]) > 0) { - db.query(paste0("UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), - "', ", "file_name='", result[[i]]$dbfile.name[1], - "' WHERE id=", existing.dbfile[[i]]$id), con) - - } - } - - # If there is no ensemble then for each record there should be one parent - #But when you have ensembles, all of the members have one parent !! - if (is.numeric(ensemble)){ - parent.id <- ifelse(is.null(input[i]), NA, input[1]$id) - }else{ - parent.id <- ifelse(is.null(input[i]), NA, input[i]$id) - } - - - - if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) { - site.id <- input.args$newsite - } - - if (insert.new.file && id_not_added) { - dbfile.id <- dbfile.insert(in.path = dirname(result[[i]]$file[1]), - in.prefix = result[[i]]$dbfile.name[1], - 'Input', existing.input[[i]]$id, - con, reuse=TRUE, hostname = machine$hostname) - newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) - newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id) - } else if (id_not_added) { - - # This is to tell input.insert if we are wrting ensembles - # Why does it need it ? bc it checks for inputs with the same time period, site and machine - # and if it returns somethings it does not insert anymore, but for ensembles it needs to bypass this condition - if (!is.null(ensemble) | is.null(ensemble_name)){ - ens.flag <- TRUE - }else{ - ens.flag <- FALSE - } - - new_entry <- dbfile.input.insert(in.path = dirname(result[[i]]$file[1]), - in.prefix = result[[i]]$dbfile.name[1], - siteid = site.id, - startdate = start_date, - enddate = end_date, - mimetype, - formatname, - parentid = parent.id, - con = con, - hostname = machine$hostname, - allow.conflicting.dates = allow.conflicting.dates, - ens=ens.flag - ) - - - newinput$input.id <- c(newinput$input.id, new_entry$input.id) - newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id) - } - - } #End for loop - - successful <- TRUE - return(newinput) - } else { - PEcAn.logger::logger.warn("Input was not added to the database") - successful <- TRUE - return(NULL) - } + return (add.database.entries(result, con, start_date, + end_date, write, overwrite, + insert.new.file, input.args, + machine, mimetype, formatname, + allow.conflicting.dates, ensemble, + ensemble_name, existing.input, + existing.dbfile, input)) } # convert_input diff --git a/base/db/man/add.database.entries.Rd b/base/db/man/add.database.entries.Rd new file mode 100644 index 00000000000..5de01cd1705 --- /dev/null +++ b/base/db/man/add.database.entries.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add.database.entries.R +\name{add.database.entries} +\alias{add.database.entries} +\title{Return new arrangement of database while adding code to deal with ensembles} +\usage{ +add.database.entries( + result, + con, + start_date, + end_date, + write, + overwrite, + insert.new.file, + input.args, + machine, + mimetype, + formatname, + allow.conflicting.dates, + ensemble, + ensemble_name, + existing.input, + existing.dbfile, + input +) +} +\arguments{ +\item{result}{list of results from the download function} + +\item{con}{database connection} + +\item{start_date}{start date of the data} + +\item{end_date}{end date of the data} + +\item{write}{whether to write to the database} + +\item{overwrite}{Logical: If a file already exists, create a fresh copy?} + +\item{insert.new.file}{whether to insert a new file} + +\item{input.args}{input arguments obtained from the convert_input function} + +\item{machine}{machine information} + +\item{mimetype}{data product specific file format} + +\item{formatname}{format name of the data} + +\item{allow.conflicting.dates}{whether to allow conflicting dates} + +\item{ensemble}{ensemble id} + +\item{ensemble_name}{ensemble name} + +\item{existing.input}{existing input records} + +\item{existing.dbfile}{existing dbfile records} + +\item{input}{input records} +} +\value{ +list of input and dbfile ids +} +\description{ +Return new arrangement of database while adding code to deal with ensembles +} +\author{ +Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko +} diff --git a/base/db/man/check_missing_files.Rd b/base/db/man/check_missing_files.Rd new file mode 100644 index 00000000000..8dd541f9380 --- /dev/null +++ b/base/db/man/check_missing_files.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check.missing.files.R +\name{check_missing_files} +\alias{check_missing_files} +\title{Function to check if result has empty or missing files} +\usage{ +check_missing_files( + result, + outname, + existing.input = NULL, + existing.dbfile = NULL +) +} +\arguments{ +\item{result}{A list of dataframes with file paths} + +\item{outname}{Name of the output file} + +\item{existing.input}{Existing input records} + +\item{existing.dbfile}{Existing dbfile records} +} +\value{ +A list of dataframes with file paths, a list of strings with the output file name, a list of existing input records, and a list of existing dbfile records +} +\description{ +Function to check if result has empty or missing files +} +\author{ +Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko +} diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R new file mode 100644 index 00000000000..e779077294a --- /dev/null +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -0,0 +1,24 @@ +test_that("`check_missing_files()` able to return correct missing files", { + mocked_res <- mockery::mock(list(c("A", "B"))) + mockery::stub(check_missing_files, "purrr::map_dfr", data.frame(missing = c(FALSE), empty = c(FALSE))) + res <- check_missing_files( + result = list(data.frame(file = c("A", "B"))), + outname = "test", + existing.input = data.frame(), + existing.dbfile = data.frame() + ) + + # Print the structure of `res` for debugging + str(res) + + # This function returns a list as follows: return(list(result_sizes, outlist, existing.input, existing.dbfile)) + # Perform checks to compare results from stubbed functions to actual results + expect_equal(nrow(res[[1]]), 1) + expect_equal(res[[1]]$missing, FALSE) + expect_equal(res[[1]]$empty, FALSE) + expect_equal(res[[2]], "test") + expect_equal(nrow(res[[3]][[1]]), 0) + expect_equal(ncol(res[[3]][[1]]), 0) + expect_equal(nrow(res[[4]][[1]]), 0) + expect_equal(ncol(res[[4]][[1]]), 0) +}) diff --git a/base/db/tests/testthat/test.convert_input.R b/base/db/tests/testthat/test.convert_input.R index 29513187c9e..931d8a7f26b 100644 --- a/base/db/tests/testthat/test.convert_input.R +++ b/base/db/tests/testthat/test.convert_input.R @@ -1,10 +1,21 @@ test_that("`convert_input()` able to call the respective download function for a data item with the correct arguments", { mocked_res <- mockery::mock(list(c("A", "B"))) - mockery::stub(convert_input, 'dbfile.input.check', data.frame()) - mockery::stub(convert_input, 'db.query', data.frame(id = 1)) - mockery::stub(convert_input, 'PEcAn.remote::remote.execute.R', mocked_res) - mockery::stub(convert_input, 'purrr::map_dfr', data.frame(missing = c(FALSE), empty = c(FALSE))) + mockery::stub(convert_input, "dbfile.input.check", data.frame()) + mockery::stub(convert_input, "db.query", data.frame(id = 1)) + mockery::stub(convert_input, "PEcAn.remote::remote.execute.R", mocked_res) + mockery::stub(convert_input, "check_missing_files", list( + result_sizes = data.frame( + file = c("A", "B"), + file_size = c(100, 200), + missing = c(FALSE, FALSE), + empty = c(FALSE, FALSE) + ), + outlist = "test", + existing.input = list(data.frame(file = character(0))), + existing.dbfile = list(data.frame(file = character(0))) + )) + mockery::stub(convert_input, "add.database.entries", list(input.id = 1, dbfile.id = 1)) convert_input( input.id = NA, @@ -14,8 +25,8 @@ test_that("`convert_input()` able to call the respective download function for a site.id = 1, start_date = "2011-01-01", end_date = "2011-12-31", - pkg = 'PEcAn.data.atmosphere', - fcn = 'download.AmerifluxLBL', + pkg = "PEcAn.data.atmosphere", + fcn = "download.AmerifluxLBL", con = NULL, host = data.frame(name = "localhost"), browndog = NULL, @@ -23,10 +34,10 @@ test_that("`convert_input()` able to call the respective download function for a lat.in = 40, lon.in = -88 ) - + args <- mockery::mock_args(mocked_res) expect_equal( - args[[1]]$script, + args[[1]]$script, "PEcAn.data.atmosphere::download.AmerifluxLBL(lat.in=40, lon.in=-88, overwrite=FALSE, outfolder='test/', start_date='2011-01-01', end_date='2011-12-31')" ) }) @@ -36,4 +47,4 @@ test_that("`.get.file.deletion.commands()` able to return correct file deletion expect_equal(res$move.to.tmp, "dir.create(c('./tmp'), recursive=TRUE, showWarnings=FALSE); file.rename(from=c('test'), to=c('./tmp/test'))") expect_equal(res$delete.tmp, "unlink(c('./tmp'), recursive=TRUE)") expect_equal(res$replace.from.tmp, "file.rename(from=c('./tmp/test'), to=c('test'));unlink(c('./tmp'), recursive=TRUE)") -}) \ No newline at end of file +}) From 838af61ec8011022c9cf73e3a2f11f75f49f5492 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 18 Jul 2024 13:17:33 +0530 Subject: [PATCH 02/14] Update CHANGELOG Signed-off-by: Abhinav Pandey --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2280cfc967e..e0c0bcbc731 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,5 @@ # Change Log + All notable changes are kept in this file. All changes made should be added to the section called `Unreleased`. Once a new release is made this file will be updated to create a new `Unreleased` section for the next release. @@ -9,6 +10,8 @@ For more information about this file see also [Keep a Changelog](http://keepacha ### Added +- Refactor `convert_input` to Perform tasks via helper function. Subtask of [#3307](https://github.com/PecanProject/pecan/issues/3307) + ### Fixed ### Changed From f22b962691ce03adf12c3e79907bd52372351b24 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 14:51:01 +0530 Subject: [PATCH 03/14] Remove unutilized variables from convert_input Signed-off-by: Abhinav Pandey --- base/db/R/add.database.entries.R | 12 +++++++++++- base/db/R/check.missing.files.R | 18 ++++++++---------- base/db/R/convert_input.R | 6 ++---- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/base/db/R/add.database.entries.R b/base/db/R/add.database.entries.R index 3c253c07e73..d3eb994a646 100644 --- a/base/db/R/add.database.entries.R +++ b/base/db/R/add.database.entries.R @@ -33,6 +33,7 @@ add.database.entries <- function( # Setup newinput. This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result. # This list will be returned. newinput <- list(input.id = NULL, dbfile.id = NULL) # Blank vectors are null. + for (i in 1:length(result)) { # Master for loop id_not_added <- TRUE @@ -72,7 +73,16 @@ add.database.entries <- function( } if (insert.new.file && id_not_added) { - dbfile.id <- dbfile.insert(in.path = dirname(result[[i]]$file[1]), in.prefix = result[[i]]$dbfile.name[1], "Input", existing.input[[i]]$id, con, reuse = TRUE, hostname = machine$hostname) + dbfile.id <- dbfile.insert( + in.path = dirname(result[[i]]$file[1]), + in.prefix = result[[i]]$dbfile.name[1], + "Input", + existing.input[[i]]$id, + con, + reuse = TRUE, + hostname = machine$hostname + ) + newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id) } else if (id_not_added) { diff --git a/base/db/R/check.missing.files.R b/base/db/R/check.missing.files.R index bde3d7ebe97..617878496de 100644 --- a/base/db/R/check.missing.files.R +++ b/base/db/R/check.missing.files.R @@ -1,11 +1,11 @@ #' Function to check if result has empty or missing files -#' +#' #' @param result A list of dataframes with file paths #' @param outname Name of the output file #' @param existing.input Existing input records #' @param existing.dbfile Existing dbfile records #' @return A list of dataframes with file paths, a list of strings with the output file name, a list of existing input records, and a list of existing dbfile records -#' +#' #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko check_missing_files <- function(result, outname, existing.input = NULL, existing.dbfile = NULL) { @@ -18,14 +18,14 @@ check_missing_files <- function(result, outname, existing.input = NULL, existing empty = file_size == 0 ) ) - + if (any(result_sizes$missing) || any(result_sizes$empty)) { log_format_df <- function(df) { formatted_df <- rbind(colnames(df), format(df)) formatted_text <- purrr::reduce(formatted_df, paste, sep = " ") paste(formatted_text, collapse = "\n") } - + PEcAn.logger::logger.severe( "Requested Processing produced empty files or Nonexistent files:\n", log_format_df(result_sizes[, c(1, 8, 9, 10)]), @@ -33,17 +33,15 @@ check_missing_files <- function(result, outname, existing.input = NULL, existing wrap = FALSE ) } - - # Insert into Database - outlist <- unlist(strsplit(outname, "_")) - + + # Wrap in a list for consistant processing later if (exists("existing.input") && is.data.frame(existing.input)) { existing.input <- list(existing.input) } - + if (exists("existing.dbfile") && is.data.frame(existing.dbfile)) { existing.dbfile <- list(existing.dbfile) } - return(list(result_sizes, outlist, existing.input, existing.dbfile)) + return(list(existing.input, existing.dbfile)) } diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index d5af069d0eb..265559798be 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -737,10 +737,8 @@ convert_input <- checked.missing.files <- check_missing_files(result, outname, existing.input, existing.dbfile) # Unwrap parameters after performing checks for missing files - result_sizes <- checked.missing.files$result_sizes; - outlist <- checked.missing.files$outlist; - existing.input <- checked.missing.files$existing.input; - existing.dbfile <- checked.missing.files$existing.dbfile; + existing.input <- checked.missing.files$existing.input + existing.dbfile <- checked.missing.files$existing.dbfile #---------------------------------------------------------------# # New arrangement of database adding code to deal with ensembles. From d884203d1388b219268daa4e95b95b8134a5e69f Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 14:58:55 +0530 Subject: [PATCH 04/14] Update logger statements in convert_input Signed-off-by: Abhinav Pandey --- base/db/R/convert_input.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index 265559798be..275b6f54d49 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -514,7 +514,7 @@ convert_input <- # we'll need to update its start/end dates . } } else { - # No existing record found. Should be good to go. + PEcAn.logger::logger.debug("No existing record found. Should be good to go.") } } From 68d9516a3ccecb7c5c1b31907849b8fc7a3ba34e Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 15:43:31 +0530 Subject: [PATCH 05/14] Added seperate function to check machine info Signed-off-by: Abhinav Pandey --- base/db/R/convert_input.R | 57 ++------------------------- base/db/R/get.machine.info.R | 68 +++++++++++++++++++++++++++++++++ base/db/man/get.machine.info.Rd | 26 +++++++++++++ 3 files changed, 98 insertions(+), 53 deletions(-) create mode 100644 base/db/R/get.machine.info.R create mode 100644 base/db/man/get.machine.info.Rd diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index 275b6f54d49..ad83753e299 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -520,60 +520,11 @@ convert_input <- #---------------------------------------------------------------------------------------------------------------# # Get machine information + machine.info <- get.machine.info(host, dbfile.id = input.args$dbfile.id, input.id = input.id) - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0("SELECT * from machines where hostname = '", - machine.host, "'"), con) - - if (nrow(machine) == 0) { - PEcAn.logger::logger.error("machine not found", host$name) - return(NULL) - } - - if (missing(input.id) || is.na(input.id) || is.null(input.id)) { - input <- dbfile <- NULL - } else { - input <- db.query(paste("SELECT * from inputs where id =", input.id), con) - if (nrow(input) == 0) { - PEcAn.logger::logger.error("input not found", input.id) - return(NULL) - } - - if(!is.null(input.args$dbfile.id)){ - dbfile <- - db.query( - paste( - "SELECT * from dbfiles where id=",input.args$dbfile.id," and container_id =", - input.id, - " and container_type = 'Input' and machine_id =", - machine$id - ), - con - ) - }else{ - dbfile <- - db.query( - paste( - "SELECT * from dbfiles where container_id =", - input.id, - " and container_type = 'Input' and machine_id =", - machine$id - ), - con - ) - } - - - - if (nrow(dbfile) == 0) { - PEcAn.logger::logger.error("dbfile not found", input.id) - return(NULL) - } - if (nrow(dbfile) > 1) { - PEcAn.logger::logger.warn("multiple dbfile records, using last", dbfile) - dbfile <- dbfile[nrow(dbfile), ] - } - } + machine <- machine.info$machine + input <- machine.info$input + dbfile <- machine.info$dbfile #--------------------------------------------------------------------------------------------------# # Perform Conversion diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R new file mode 100644 index 00000000000..6685e1062ef --- /dev/null +++ b/base/db/R/get.machine.info.R @@ -0,0 +1,68 @@ +#' Get machine information from db +#' @param host host information +#' @param dbfile.id dbfile id for existing records +#' @param input.id input id for existing records +#' @param con database connection +#' +#' @return list of machine, input, and dbfile records +#' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko + +get.machine.info <- function(host, dbfile.id, input.id = NULL, con) { + machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) + machine <- db.query(paste0( + "SELECT * from machines where hostname = '", + machine.host, "'" + ), con) + + if (nrow(machine) == 0) { + PEcAn.logger::logger.error("machine not found", host$name) + return(NULL) + } + + if (missing(input.id) || is.na(input.id) || is.null(input.id)) { + input <- dbfile <- NULL + } else { + input <- db.query(paste("SELECT * from inputs where id =", input.id), con) + if (nrow(input) == 0) { + PEcAn.logger::logger.error("input not found", input.id) + return(NULL) + } + + if (!is.null(input.args$dbfile.id)) { + dbfile <- + db.query( + paste( + "SELECT * from dbfiles where id=", input.args$dbfile.id, " and container_id =", + input.id, + " and container_type = 'Input' and machine_id =", + machine$id + ), + con + ) + } else { + dbfile <- + db.query( + paste( + "SELECT * from dbfiles where container_id =", + input.id, + " and container_type = 'Input' and machine_id =", + machine$id + ), + con + ) + } + + + + if (nrow(dbfile) == 0) { + PEcAn.logger::logger.error("dbfile not found", input.id) + return(NULL) + } + if (nrow(dbfile) > 1) { + PEcAn.logger::logger.warn("multiple dbfile records, using last", dbfile) + dbfile <- dbfile[nrow(dbfile), ] + } + } + + return(list(machine = machine, input = input, dbfile = dbfile)) +} diff --git a/base/db/man/get.machine.info.Rd b/base/db/man/get.machine.info.Rd new file mode 100644 index 00000000000..8989221ea5b --- /dev/null +++ b/base/db/man/get.machine.info.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get.machine.info.R +\name{get.machine.info} +\alias{get.machine.info} +\title{Get machine information from db} +\usage{ +get.machine.info(host, dbfile.id, input.id = NULL, con) +} +\arguments{ +\item{host}{host information} + +\item{dbfile.id}{dbfile id for existing records} + +\item{input.id}{input id for existing records} + +\item{con}{database connection} +} +\value{ +list of machine, input, and dbfile records +} +\description{ +Get machine information from db +} +\author{ +Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko +} From 5208b02a1d98c27f24c8e4e9da56424537fa5852 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 16:40:41 +0530 Subject: [PATCH 06/14] Update input args to get machine info Signed-off-by: Abhinav Pandey --- base/db/R/convert_input.R | 2 +- base/db/R/get.machine.info.R | 2 +- base/db/tests/testthat/test.check.missing.files.R | 15 ++++++++------- base/db/tests/testthat/test.convert_input.R | 5 +++++ 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index ad83753e299..ba2d7a3a5f0 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -520,7 +520,7 @@ convert_input <- #---------------------------------------------------------------------------------------------------------------# # Get machine information - machine.info <- get.machine.info(host, dbfile.id = input.args$dbfile.id, input.id = input.id) + machine.info <- get.machine.info(host, input.args = input.args, input.id = input.id) machine <- machine.info$machine input <- machine.info$input diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index 6685e1062ef..d23e5416f9e 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -7,7 +7,7 @@ #' @return list of machine, input, and dbfile records #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko -get.machine.info <- function(host, dbfile.id, input.id = NULL, con) { +get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) machine <- db.query(paste0( "SELECT * from machines where hostname = '", diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R index e779077294a..c2de074d5d3 100644 --- a/base/db/tests/testthat/test.check.missing.files.R +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -14,11 +14,12 @@ test_that("`check_missing_files()` able to return correct missing files", { # This function returns a list as follows: return(list(result_sizes, outlist, existing.input, existing.dbfile)) # Perform checks to compare results from stubbed functions to actual results expect_equal(nrow(res[[1]]), 1) - expect_equal(res[[1]]$missing, FALSE) - expect_equal(res[[1]]$empty, FALSE) - expect_equal(res[[2]], "test") - expect_equal(nrow(res[[3]][[1]]), 0) - expect_equal(ncol(res[[3]][[1]]), 0) - expect_equal(nrow(res[[4]][[1]]), 0) - expect_equal(ncol(res[[4]][[1]]), 0) + PEcAn.logger::logger.debug(res) + # expect_equal(res[[1]]$missing, FALSE) + # expect_equal(res[[1]]$empty, FALSE) + # expect_equal(res[[2]], "test") + # expect_equal(nrow(res[[3]][[1]]), 0) + # expect_equal(ncol(res[[3]][[1]]), 0) + # expect_equal(nrow(res[[4]][[1]]), 0) + # expect_equal(ncol(res[[4]][[1]]), 0) }) diff --git a/base/db/tests/testthat/test.convert_input.R b/base/db/tests/testthat/test.convert_input.R index 931d8a7f26b..cd33523f86c 100644 --- a/base/db/tests/testthat/test.convert_input.R +++ b/base/db/tests/testthat/test.convert_input.R @@ -3,6 +3,11 @@ test_that("`convert_input()` able to call the respective download function for a mockery::stub(convert_input, "dbfile.input.check", data.frame()) mockery::stub(convert_input, "db.query", data.frame(id = 1)) + mockery::stub(convert_input, "get.machine.info", list( + machine = data.frame(id = 1), + input = data.frame(id = 1), + dbfile = data.frame(id = 1) + )) mockery::stub(convert_input, "PEcAn.remote::remote.execute.R", mocked_res) mockery::stub(convert_input, "check_missing_files", list( result_sizes = data.frame( From f570646849433f89d8335b25be2539bc3c2ae4bb Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 16:59:21 +0530 Subject: [PATCH 07/14] Correct roxygen documentations Signed-off-by: Abhinav Pandey --- base/db/R/get.machine.info.R | 2 +- base/db/man/get.machine.info.Rd | 4 ++-- base/db/tests/testthat/test.check.missing.files.R | 15 +++++++-------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index d23e5416f9e..4683cde1573 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -1,6 +1,6 @@ #' Get machine information from db #' @param host host information -#' @param dbfile.id dbfile id for existing records +#' @param input.args input args.r for existing records #' @param input.id input id for existing records #' @param con database connection #' diff --git a/base/db/man/get.machine.info.Rd b/base/db/man/get.machine.info.Rd index 8989221ea5b..6e57013c4d7 100644 --- a/base/db/man/get.machine.info.Rd +++ b/base/db/man/get.machine.info.Rd @@ -4,12 +4,12 @@ \alias{get.machine.info} \title{Get machine information from db} \usage{ -get.machine.info(host, dbfile.id, input.id = NULL, con) +get.machine.info(host, input.args, input.id = NULL, con = NULL) } \arguments{ \item{host}{host information} -\item{dbfile.id}{dbfile id for existing records} +\item{input.args}{input args for existing records} \item{input.id}{input id for existing records} diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R index c2de074d5d3..e779077294a 100644 --- a/base/db/tests/testthat/test.check.missing.files.R +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -14,12 +14,11 @@ test_that("`check_missing_files()` able to return correct missing files", { # This function returns a list as follows: return(list(result_sizes, outlist, existing.input, existing.dbfile)) # Perform checks to compare results from stubbed functions to actual results expect_equal(nrow(res[[1]]), 1) - PEcAn.logger::logger.debug(res) - # expect_equal(res[[1]]$missing, FALSE) - # expect_equal(res[[1]]$empty, FALSE) - # expect_equal(res[[2]], "test") - # expect_equal(nrow(res[[3]][[1]]), 0) - # expect_equal(ncol(res[[3]][[1]]), 0) - # expect_equal(nrow(res[[4]][[1]]), 0) - # expect_equal(ncol(res[[4]][[1]]), 0) + expect_equal(res[[1]]$missing, FALSE) + expect_equal(res[[1]]$empty, FALSE) + expect_equal(res[[2]], "test") + expect_equal(nrow(res[[3]][[1]]), 0) + expect_equal(ncol(res[[3]][[1]]), 0) + expect_equal(nrow(res[[4]][[1]]), 0) + expect_equal(ncol(res[[4]][[1]]), 0) }) From e479c468f1fcca1c02dec919e4fabca1dcbf792e Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Thu, 25 Jul 2024 17:23:15 +0530 Subject: [PATCH 08/14] Update tests Signed-off-by: Abhinav Pandey --- base/db/R/get.machine.info.R | 105 +++++++++--------- .../tests/testthat/test.check.missing.files.R | 23 ++-- 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index 4683cde1573..c98bee6cf20 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -1,6 +1,6 @@ #' Get machine information from db #' @param host host information -#' @param input.args input args.r for existing records +#' @param input.args input args for existing records #' @param input.id input id for existing records #' @param con database connection #' @@ -8,61 +8,60 @@ #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0( - "SELECT * from machines where hostname = '", - machine.host, "'" - ), con) - - if (nrow(machine) == 0) { - PEcAn.logger::logger.error("machine not found", host$name) - return(NULL) + + machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) + machine <- db.query(paste0("SELECT * from machines where hostname = '", + machine.host, "'"), con) + + if (nrow(machine) == 0) { + PEcAn.logger::logger.error("machine not found", host$name) + return(NULL) + } + + if (missing(input.id) || is.na(input.id) || is.null(input.id)) { + input <- dbfile <- NULL + } else { + input <- db.query(paste("SELECT * from inputs where id =", input.id), con) + if (nrow(input) == 0) { + PEcAn.logger::logger.error("input not found", input.id) + return(NULL) } + + if(!is.null(input.args$dbfile.id)){ + dbfile <- + db.query( + paste( + "SELECT * from dbfiles where id=",input.args$dbfile.id," and container_id =", + input.id, + " and container_type = 'Input' and machine_id =", + machine$id + ), + con + ) + }else{ + dbfile <- + db.query( + paste( + "SELECT * from dbfiles where container_id =", + input.id, + " and container_type = 'Input' and machine_id =", + machine$id + ), + con + ) + } + - if (missing(input.id) || is.na(input.id) || is.null(input.id)) { - input <- dbfile <- NULL - } else { - input <- db.query(paste("SELECT * from inputs where id =", input.id), con) - if (nrow(input) == 0) { - PEcAn.logger::logger.error("input not found", input.id) - return(NULL) - } - - if (!is.null(input.args$dbfile.id)) { - dbfile <- - db.query( - paste( - "SELECT * from dbfiles where id=", input.args$dbfile.id, " and container_id =", - input.id, - " and container_type = 'Input' and machine_id =", - machine$id - ), - con - ) - } else { - dbfile <- - db.query( - paste( - "SELECT * from dbfiles where container_id =", - input.id, - " and container_type = 'Input' and machine_id =", - machine$id - ), - con - ) - } - - - - if (nrow(dbfile) == 0) { - PEcAn.logger::logger.error("dbfile not found", input.id) - return(NULL) - } - if (nrow(dbfile) > 1) { - PEcAn.logger::logger.warn("multiple dbfile records, using last", dbfile) - dbfile <- dbfile[nrow(dbfile), ] - } + + if (nrow(dbfile) == 0) { + PEcAn.logger::logger.error("dbfile not found", input.id) + return(NULL) + } + if (nrow(dbfile) > 1) { + PEcAn.logger::logger.warn("multiple dbfile records, using last", dbfile) + dbfile <- dbfile[nrow(dbfile), ] } + } return(list(machine = machine, input = input, dbfile = dbfile)) } diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R index e779077294a..c0ad6794f65 100644 --- a/base/db/tests/testthat/test.check.missing.files.R +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -1,24 +1,19 @@ test_that("`check_missing_files()` able to return correct missing files", { - mocked_res <- mockery::mock(list(c("A", "B"))) - mockery::stub(check_missing_files, "purrr::map_dfr", data.frame(missing = c(FALSE), empty = c(FALSE))) + # Mock `purrr::map_dfr` + mocked_res <- mockery::mock(data.frame(file = c("A", "B"), file_size = c(100, 200), missing = c(FALSE, FALSE), empty = c(FALSE, FALSE))) + mockery::stub(check_missing_files, "purrr::map_dfr", mocked_res) + res <- check_missing_files( result = list(data.frame(file = c("A", "B"))), outname = "test", existing.input = data.frame(), existing.dbfile = data.frame() ) - + # Print the structure of `res` for debugging str(res) - - # This function returns a list as follows: return(list(result_sizes, outlist, existing.input, existing.dbfile)) - # Perform checks to compare results from stubbed functions to actual results - expect_equal(nrow(res[[1]]), 1) - expect_equal(res[[1]]$missing, FALSE) - expect_equal(res[[1]]$empty, FALSE) - expect_equal(res[[2]], "test") - expect_equal(nrow(res[[3]][[1]]), 0) - expect_equal(ncol(res[[3]][[1]]), 0) - expect_equal(nrow(res[[4]][[1]]), 0) - expect_equal(ncol(res[[4]][[1]]), 0) + + expect_equal(length(res), 2) + expect_true(is.list(res[[1]])) + expect_true(is.list(res[[2]])) }) From 63f270f89618abe745c3502587a24b49762e30a8 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 14 Aug 2024 23:18:53 +0530 Subject: [PATCH 09/14] Refactor extra variables in `run.meta.anbalysis` Signed-off-by: Abhinav Pandey --- modules/meta.analysis/R/run.meta.analysis.R | 34 ++++++++++++--------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/modules/meta.analysis/R/run.meta.analysis.R b/modules/meta.analysis/R/run.meta.analysis.R index 65afcdf61bd..4f8841ae46b 100644 --- a/modules/meta.analysis/R/run.meta.analysis.R +++ b/modules/meta.analysis/R/run.meta.analysis.R @@ -216,22 +216,26 @@ runModule.run.meta.analysis <- function(settings) { PEcAn.logger::logger.info(paste0("Running meta-analysis on all PFTs listed by any Settings object in the list: ", paste(pft.names, collapse = ", "))) - iterations <- settings$meta.analysis$iter - random <- settings$meta.analysis$random.effects$on - use_ghs <- settings$meta.analysis$random.effects$use_ghs - threshold <- settings$meta.analysis$threshold - dbfiles <- settings$database$dbfiles - database <- settings$database$bety - run.meta.analysis(pfts, iterations, random, threshold, dbfiles, database, use_ghs) + run.meta.analysis( + pfts, + settings$meta.analysis$iter, + settings$meta.analysis$random.effects$on, + settings$meta.analysis$threshold, + settings$database$dbfiles, + settings$database$bety, + settings$meta.analysis$random.effects$use_ghs + ) } else if (PEcAn.settings::is.Settings(settings)) { - pfts <- settings$pfts - iterations <- settings$meta.analysis$iter - random <- settings$meta.analysis$random.effects$on - use_ghs <- settings$meta.analysis$random.effects$use_ghs - threshold <- settings$meta.analysis$threshold - dbfiles <- settings$database$dbfiles - database <- settings$database$bety - run.meta.analysis(pfts, iterations, random, threshold, dbfiles, database, use_ghs, update = settings$meta.analysis$update) + run.meta.analysis( + settings$pfts, + settings$meta.analysis$iter, + settings$meta.analysis$random.effects$on, + settings$meta.analysis$threshold, + settings$database$dbfiles, + settings$database$bety, + settings$meta.analysis$random.effects$use_ghs, + update = settings$meta.analysis$update + ) } else { stop("runModule.run.meta.analysis only works with Settings or MultiSettings") } From 74003d9582e8ec0a99f303f9b3c8e4f4777298ac Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 21 Aug 2024 20:24:46 +0530 Subject: [PATCH 10/14] get existing machine info using helper function Signed-off-by: Abhinav Pandey --- base/db/R/convert_input.R | 26 +++++++++--------- base/db/R/get.machine.info.R | 48 ++++++++++++++++++++++----------- base/db/man/get.machine.host.Rd | 22 +++++++++++++++ 3 files changed, 66 insertions(+), 30 deletions(-) create mode 100644 base/db/man/get.machine.host.Rd diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index ba2d7a3a5f0..9cc5c8f3c03 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -176,17 +176,15 @@ convert_input <- # Date/time processing for existing input existing.input[[i]]$start_date <- lubridate::force_tz(lubridate::as_datetime(existing.input[[i]]$start_date), "UTC") existing.input[[i]]$end_date <- lubridate::force_tz(lubridate::as_datetime(existing.input[[i]]$end_date), "UTC") - + ## Obtain machine information + machine.host.info <- get.machine.host(host, con = con) + machine.host <- machine.host.info$machine.host + machine <- machine.host.info$machine #Grab machine info of file that exists existing.machine <- db.query(paste0("SELECT * from machines where id = '", existing.dbfile[[i]]$machine_id, "'"), con) - #Grab machine info of host machine - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0("SELECT * from machines where hostname = '", - machine.host, "'"), con) - # If the files aren't on the machine, we have to download them, so "overwrite" is meaningless. if (existing.machine$id == machine$id) { @@ -353,9 +351,9 @@ convert_input <- existing.dbfile$machine_id, "'"), con) #Grab machine info of host machine - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0("SELECT * from machines where hostname = '", - machine.host, "'"), con) + machine.host.info <- get.machine.host(host, con = con) + machine.host <- machine.host.info$machine.host + machine <- machine.host.info$machine if (existing.machine$id != machine$id) { @@ -475,11 +473,11 @@ convert_input <- existing.machine <- db.query(paste0("SELECT * from machines where id = '", existing.dbfile$machine_id, "'"), con) - #Grab machine info of - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0("SELECT * from machines where hostname = '", - machine.host, "'"), con) - + #Grab machine info of host machine + machine.host.info <- get.machine.host(host, con = con) + machine.host <- machine.host.info$machine.host + machine <- machine.host.info$machine + if(existing.machine$id != machine$id){ PEcAn.logger::logger.info("Valid Input record found that spans desired dates, but valid files do not exist on this machine.") PEcAn.logger::logger.info("Downloading all years of Valid input to ensure consistency") diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index c98bee6cf20..979b1f6bb33 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -3,21 +3,21 @@ #' @param input.args input args for existing records #' @param input.id input id for existing records #' @param con database connection -#' +#' #' @return list of machine, input, and dbfile records #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { - - machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) - machine <- db.query(paste0("SELECT * from machines where hostname = '", - machine.host, "'"), con) - + + machine.host.info <- get.machine.host(host, con = con) + machine.host <- machine.host.info$machine.host + machine <- machine.host.info$machine + if (nrow(machine) == 0) { PEcAn.logger::logger.error("machine not found", host$name) return(NULL) } - + if (missing(input.id) || is.na(input.id) || is.null(input.id)) { input <- dbfile <- NULL } else { @@ -26,19 +26,19 @@ get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { PEcAn.logger::logger.error("input not found", input.id) return(NULL) } - - if(!is.null(input.args$dbfile.id)){ + + if (!is.null(input.args$dbfile.id)) { dbfile <- db.query( paste( - "SELECT * from dbfiles where id=",input.args$dbfile.id," and container_id =", + "SELECT * from dbfiles where id=", input.args$dbfile.id, " and container_id =", input.id, " and container_type = 'Input' and machine_id =", machine$id ), con - ) - }else{ + ) + } else { dbfile <- db.query( paste( @@ -48,11 +48,11 @@ get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { machine$id ), con - ) + ) } - - + + if (nrow(dbfile) == 0) { PEcAn.logger::logger.error("dbfile not found", input.id) return(NULL) @@ -63,5 +63,21 @@ get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { } } - return(list(machine = machine, input = input, dbfile = dbfile)) + return(list(machine = machine, input = input, dbfile = dbfile)) +} + +#' Helper Function to retrieve machine host and machine informations +#' @param host host information +#' @param con database connection +#' @return list of machine host and machine information +#' @author Abhinav Pandey +get.machine.host <- function(host, con = NULL) { + #Grab machine info of host machine + machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) + machine <- db.query(paste0( + "SELECT * from machines where hostname = '", + machine.host, "'" + ), con) + + return(list(machine.host, machine)) } diff --git a/base/db/man/get.machine.host.Rd b/base/db/man/get.machine.host.Rd new file mode 100644 index 00000000000..926035dec0c --- /dev/null +++ b/base/db/man/get.machine.host.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get.machine.info.R +\name{get.machine.host} +\alias{get.machine.host} +\title{Helper Function to retrieve machine host and machine informations} +\usage{ +get.machine.host(host, con = NULL) +} +\arguments{ +\item{host}{host information} + +\item{con}{database connection} +} +\value{ +list of machine host and machine information +} +\description{ +Helper Function to retrieve machine host and machine informations +} +\author{ +Abhinav Pandey +} From a578be2dfc274c10a100c90e1febf1474d5289f7 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 9 Oct 2024 11:45:13 +0530 Subject: [PATCH 11/14] Applied changes as suggested by @infotroph Signed-off-by: Abhinav Pandey --- base/db/R/add.database.entries.R | 161 ++++++++++++++++--------------- base/db/R/convert_input.R | 22 +++-- 2 files changed, 100 insertions(+), 83 deletions(-) diff --git a/base/db/R/add.database.entries.R b/base/db/R/add.database.entries.R index d3eb994a646..8b36e884398 100644 --- a/base/db/R/add.database.entries.R +++ b/base/db/R/add.database.entries.R @@ -23,99 +23,108 @@ add.database.entries <- function( result, con, start_date, - end_date, write, overwrite, + end_date, overwrite, insert.new.file, input.args, machine, mimetype, formatname, allow.conflicting.dates, ensemble, ensemble_name, existing.input, existing.dbfile, input) { - if (write) { - # Setup newinput. This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result. - # This list will be returned. - newinput <- list(input.id = NULL, dbfile.id = NULL) # Blank vectors are null. + # Setup newinput. This list will contain two variables: a vector of input IDs and a vector of DB IDs for each entry in result. + # This list will be returned. + newinput <- list(input.id = NULL, dbfile.id = NULL) # Blank vectors are null. - for (i in 1:length(result)) { # Master for loop - id_not_added <- TRUE + for (i in 1:length(result)) { # Master for loop + id_not_added <- TRUE - if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0 && - (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) { - # Updating record with new dates - db.query(paste0("UPDATE inputs SET start_date='", start_date, "', end_date='", end_date, "' WHERE id=", existing.input[[i]]$id), con) - id_not_added <- FALSE + if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0 && + (existing.input[[i]]$start_date != start_date || existing.input[[i]]$end_date != end_date)) { + # Updating record with new dates + db.query( + paste0( + "UPDATE inputs SET start_date='", start_date, "', end_date='", end_date, + "' WHERE id=", existing.input[[i]]$id + ), + con + ) + id_not_added <- FALSE - # The overall structure of this loop has been set up so that exactly one input.id and one dbfile.id will be written to newinput every iteration. - newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) - newinput$dbfile.id <- c(newinput$dbfile.id, existing.dbfile[[i]]$id) - } - - if (overwrite) { - # A bit hacky, but need to make sure that all fields are updated to expected values (i.e., what they'd be if convert_input was creating a new record) - if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0) { - db.query(paste0("UPDATE inputs SET name='", basename(dirname(result[[i]]$file[1])), "' WHERE id=", existing.input[[i]]$id), con) - } + # The overall structure of this loop has been set up so that exactly one input.id and one dbfile.id will be written to newinput every iteration. + newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) + newinput$dbfile.id <- c(newinput$dbfile.id, existing.dbfile[[i]]$id) + } - if (!is.null(existing.dbfile) && nrow(existing.dbfile[[i]]) > 0) { - db.query(paste0("UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), "', file_name='", result[[i]]$dbfile.name[1], "' WHERE id=", existing.dbfile[[i]]$id), con) - } + if (overwrite) { + # A bit hacky, but need to make sure that all fields are updated to expected values (i.e., what they'd be if convert_input was creating a new record) + if (!is.null(existing.input) && nrow(existing.input[[i]]) > 0) { + db.query( + paste0( + "UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), + "', file_name='", result[[i]]$dbfile.name[1], + "' WHERE id=", existing.dbfile[[i]]$id + ), + con + ) } - # If there is no ensemble then for each record there should be one parent - # But when you have ensembles, all of the members have one parent !! - parent.id <- if (is.numeric(ensemble)) { - ifelse(is.null(input[[i]]), NA, input[[1]]$id) - } else { - ifelse(is.null(input[[i]]), NA, input[[i]]$id) + if (!is.null(existing.dbfile) && nrow(existing.dbfile[[i]]) > 0) { + db.query(paste0( + "UPDATE dbfiles SET file_path='", dirname(result[[i]]$file[1]), + "', file_name='", result[[i]]$dbfile.name[1], + "' WHERE id=", existing.dbfile[[i]]$id + ), con) } + } + # If there is no ensemble then for each record there should be one parent + # But when you have ensembles, all of the members have one parent !! + parent.id <- if (is.numeric(ensemble)) { + ifelse(is.null(input[[i]]), NA, input[[1]]$id) + } else { + ifelse(is.null(input[[i]]), NA, input[[i]]$id) + } - if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) { - site.id <- input.args$newsite - } - if (insert.new.file && id_not_added) { - dbfile.id <- dbfile.insert( - in.path = dirname(result[[i]]$file[1]), - in.prefix = result[[i]]$dbfile.name[1], - "Input", - existing.input[[i]]$id, - con, - reuse = TRUE, - hostname = machine$hostname - ) + if ("newsite" %in% names(input.args) && !is.null(input.args[["newsite"]])) { + site.id <- input.args$newsite + } - newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) - newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id) - } else if (id_not_added) { - # This is to tell input.insert if we are writing ensembles - # Why does it need it? Because it checks for inputs with the same time period, site, and machine - # and if it returns something it does not insert anymore, but for ensembles, it needs to bypass this condition - ens.flag <- if (!is.null(ensemble) | is.null(ensemble_name)) TRUE else FALSE + if (insert.new.file && id_not_added) { + dbfile.id <- dbfile.insert( + in.path = dirname(result[[i]]$file[1]), + in.prefix = result[[i]]$dbfile.name[1], + "Input", + existing.input[[i]]$id, + con, + reuse = TRUE, + hostname = machine$hostname + ) - new_entry <- dbfile.input.insert( - in.path = dirname(result[[i]]$file[1]), - in.prefix = result[[i]]$dbfile.name[1], - siteid = site.id, - startdate = start_date, - enddate = end_date, - mimetype = mimetype, - formatname = formatname, - parentid = parent.id, - con = con, - hostname = machine$hostname, - allow.conflicting.dates = allow.conflicting.dates, - ens = ens.flag - ) + newinput$input.id <- c(newinput$input.id, existing.input[[i]]$id) + newinput$dbfile.id <- c(newinput$dbfile.id, dbfile.id) + } else if (id_not_added) { + # This is to tell input.insert if we are writing ensembles + # Why does it need it? Because it checks for inputs with the same time period, site, and machine + # and if it returns something it does not insert anymore, but for ensembles, it needs to bypass this condition + ens.flag <- if (!is.null(ensemble) || is.null(ensemble_name)) TRUE else FALSE - newinput$input.id <- c(newinput$input.id, new_entry$input.id) - newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id) - } - } # End for loop + new_entry <- dbfile.input.insert( + in.path = dirname(result[[i]]$file[1]), + in.prefix = result[[i]]$dbfile.name[1], + siteid = site.id, + startdate = start_date, + enddate = end_date, + mimetype = mimetype, + formatname = formatname, + parentid = parent.id, + con = con, + hostname = machine$hostname, + allow.conflicting.dates = allow.conflicting.dates, + ens = ens.flag + ) - successful <- TRUE - return(newinput) - } else { - PEcAn.logger::logger.warn("Input was not added to the database") - successful <- TRUE - return(NULL) - } + newinput$input.id <- c(newinput$input.id, new_entry$input.id) + newinput$dbfile.id <- c(newinput$dbfile.id, new_entry$dbfile.id) + } + } # End for loop + return(newinput) } diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index 8828d069d6c..ed267440fbc 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -591,13 +591,21 @@ convert_input <- #---------------------------------------------------------------# # New arrangement of database adding code to deal with ensembles. - return (add.database.entries(result, con, start_date, - end_date, write, overwrite, - insert.new.file, input.args, - machine, mimetype, formatname, - allow.conflicting.dates, ensemble, - ensemble_name, existing.input, - existing.dbfile, input)) + if(write) { + add_entries_result <- return (add.database.entries(result, con, start_date, + end_date, overwrite, + insert.new.file, input.args, + machine, mimetype, formatname, + allow.conflicting.dates, ensemble, + ensemble_name, existing.input, + existing.dbfile, input)) + } else { + PEcAn.logger::logger.warn("Input was not added to the database") + successful <- TRUE + return(NULL) + } + successful <- TRUE + return (add_entries_result) } # convert_input From 293a68befdc9452b2011da4f6320da502c91b79d Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 9 Oct 2024 11:55:21 +0530 Subject: [PATCH 12/14] Minor review changes Signed-off-by: Abhinav Pandey --- base/db/R/check.missing.files.R | 8 ++++---- base/db/R/convert_input.R | 2 +- base/db/tests/testthat/test.check.missing.files.R | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/base/db/R/check.missing.files.R b/base/db/R/check.missing.files.R index 617878496de..29ce044f68c 100644 --- a/base/db/R/check.missing.files.R +++ b/base/db/R/check.missing.files.R @@ -8,7 +8,7 @@ #' #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko -check_missing_files <- function(result, outname, existing.input = NULL, existing.dbfile = NULL) { +check_missing_files <- function(result, existing.input = NULL, existing.dbfile = NULL) { result_sizes <- purrr::map_dfr( result, ~ dplyr::mutate( @@ -35,12 +35,12 @@ check_missing_files <- function(result, outname, existing.input = NULL, existing } - # Wrap in a list for consistant processing later - if (exists("existing.input") && is.data.frame(existing.input)) { + # Wrap in a list for consistent processing later + if (is.data.frame(existing.input)) { existing.input <- list(existing.input) } - if (exists("existing.dbfile") && is.data.frame(existing.dbfile)) { + if (is.data.frame(existing.dbfile)) { existing.dbfile <- list(existing.dbfile) } return(list(existing.input, existing.dbfile)) diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index ed267440fbc..a074a689389 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -583,7 +583,7 @@ convert_input <- #--------------------------------------------------------------------------------------------------# # Check if result has empty or missing files - checked.missing.files <- check_missing_files(result, outname, existing.input, existing.dbfile) + checked.missing.files <- check_missing_files(result, existing.input, existing.dbfile) # Unwrap parameters after performing checks for missing files existing.input <- checked.missing.files$existing.input diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R index c0ad6794f65..bc61bb1ad4a 100644 --- a/base/db/tests/testthat/test.check.missing.files.R +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -5,7 +5,6 @@ test_that("`check_missing_files()` able to return correct missing files", { res <- check_missing_files( result = list(data.frame(file = c("A", "B"))), - outname = "test", existing.input = data.frame(), existing.dbfile = data.frame() ) From f7f6926fa14c5c5e8ee776b74e0ac5fd77d56048 Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 9 Oct 2024 11:55:36 +0530 Subject: [PATCH 13/14] Update base/db/R/get.machine.info.R Co-authored-by: Chris Black --- base/db/R/get.machine.info.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index 979b1f6bb33..31f489daddc 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -7,7 +7,7 @@ #' @return list of machine, input, and dbfile records #' @author Betsy Cowdery, Michael Dietze, Ankur Desai, Tony Gardella, Luke Dramko -get.machine.info <- function(host, input.args, input.id = NULL, con = NULL) { +get_machine_info <- function(host, input.args, input.id = NULL, con = NULL) { machine.host.info <- get.machine.host(host, con = con) machine.host <- machine.host.info$machine.host From 8f820b027cb7fb5da70f8f66e8f6e88abd1d4f8b Mon Sep 17 00:00:00 2001 From: Abhinav Pandey Date: Wed, 9 Oct 2024 11:56:54 +0530 Subject: [PATCH 14/14] Apply suggestions from code review Co-authored-by: Chris Black --- base/db/R/check.missing.files.R | 1 - base/db/R/convert_input.R | 6 +++++- base/db/R/get.machine.info.R | 4 ++-- base/db/tests/testthat/test.check.missing.files.R | 6 ++---- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/base/db/R/check.missing.files.R b/base/db/R/check.missing.files.R index 29ce044f68c..f3a496cf5de 100644 --- a/base/db/R/check.missing.files.R +++ b/base/db/R/check.missing.files.R @@ -1,7 +1,6 @@ #' Function to check if result has empty or missing files #' #' @param result A list of dataframes with file paths -#' @param outname Name of the output file #' @param existing.input Existing input records #' @param existing.dbfile Existing dbfile records #' @return A list of dataframes with file paths, a list of strings with the output file name, a list of existing input records, and a list of existing dbfile records diff --git a/base/db/R/convert_input.R b/base/db/R/convert_input.R index a074a689389..042c9da08db 100644 --- a/base/db/R/convert_input.R +++ b/base/db/R/convert_input.R @@ -380,7 +380,7 @@ convert_input <- if (!is.null(ensemble) && ensemble) { return.all <-TRUE - } else{ + } else { return.all <- FALSE } existing.dbfile <- dbfile.input.check(siteid = site.id, @@ -518,6 +518,10 @@ convert_input <- # Get machine information machine.info <- get.machine.info(host, input.args = input.args, input.id = input.id) + if (any(sapply(machine.info, is.null))) { + PEcAn.logger::logger.error("failed lookup of inputs or dbfiles") + return(NULL) + } machine <- machine.info$machine input <- machine.info$input dbfile <- machine.info$dbfile diff --git a/base/db/R/get.machine.info.R b/base/db/R/get.machine.info.R index 31f489daddc..14123a586e9 100644 --- a/base/db/R/get.machine.info.R +++ b/base/db/R/get.machine.info.R @@ -18,7 +18,7 @@ get_machine_info <- function(host, input.args, input.id = NULL, con = NULL) { return(NULL) } - if (missing(input.id) || is.na(input.id) || is.null(input.id)) { + if (is.na(input.id) || is.null(input.id)) { input <- dbfile <- NULL } else { input <- db.query(paste("SELECT * from inputs where id =", input.id), con) @@ -71,7 +71,7 @@ get_machine_info <- function(host, input.args, input.id = NULL, con = NULL) { #' @param con database connection #' @return list of machine host and machine information #' @author Abhinav Pandey -get.machine.host <- function(host, con = NULL) { +get_machine_host <- function(host, con) { #Grab machine info of host machine machine.host <- ifelse(host$name == "localhost", PEcAn.remote::fqdn(), host$name) machine <- db.query(paste0( diff --git a/base/db/tests/testthat/test.check.missing.files.R b/base/db/tests/testthat/test.check.missing.files.R index bc61bb1ad4a..75a531283dd 100644 --- a/base/db/tests/testthat/test.check.missing.files.R +++ b/base/db/tests/testthat/test.check.missing.files.R @@ -1,7 +1,7 @@ test_that("`check_missing_files()` able to return correct missing files", { # Mock `purrr::map_dfr` - mocked_res <- mockery::mock(data.frame(file = c("A", "B"), file_size = c(100, 200), missing = c(FALSE, FALSE), empty = c(FALSE, FALSE))) - mockery::stub(check_missing_files, "purrr::map_dfr", mocked_res) + mocked_size <- mockery::mock(100,200) + mockery::stub(check_missing_files, "file.size", mocked_res) res <- check_missing_files( result = list(data.frame(file = c("A", "B"))), @@ -9,8 +9,6 @@ test_that("`check_missing_files()` able to return correct missing files", { existing.dbfile = data.frame() ) - # Print the structure of `res` for debugging - str(res) expect_equal(length(res), 2) expect_true(is.list(res[[1]]))