Merge pull request #22 from rscherrer/develop

Develop
rscherrer · Feb 16, 2023 · bac6057 · bac6057
2 parents 5bdf8dc + 7abe918
commit bac6057
Show file tree

Hide file tree

Showing 62 changed files with 603 additions and 486 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,13 @@
+# Potential folders within
+peregrine/
+analysis/
+ms/
+data/
+approx/
+extra/
+mating/
+speciomer/
+
 ##### Windows
 # Windows thumbnail cache files
 Thumbs.db
@@ -1038,3 +1048,4 @@ modules.order
 Module.symvers
 Mkfile.old
 dkms.conf
+.Rproj.user
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vcpkg"]
+	path = vcpkg
+	url = https://github.com/microsoft/vcpkg
diff --git a/.travis.yml b/.travis.yml
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,29 @@
+# ./CMakeLists.txt
+
+cmake_minimum_required(VERSION 3.16)
+# Point CMake at the vcpkg submodule's toolchain file (set before project() is called).
+set(CMAKE_TOOLCHAIN_FILE "${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake")
+if (WIN32)
+    set(VCPKG_TARGET_TRIPLET x64-windows)
+endif()
+
+project(speciome)
+
+find_package(Git REQUIRED)  # git is used below to update the submodules (vcpkg)
+execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --remote
+                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+                RESULT_VARIABLE GIT_SUBMOD_RESULT)
+if (NOT GIT_SUBMOD_RESULT EQUAL "0")
+    message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
+endif()
+
+# Build as C++20. REQUIRED must be explicitly ON: set() with no value unsets the variable.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Place executables in the top-level build directory; install into the source tree.
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR})
+
+add_subdirectory(src)
+add_subdirectory(tests)
diff --git a/CMakeLists_devel.txt b/CMakeLists_devel.txt
@@ -0,0 +1,29 @@
+# ./CMakeLists.txt (developer configuration: vcpkg toolchain, builds src and tests)
+
+cmake_minimum_required(VERSION 3.16)
+# Point CMake at the vcpkg submodule's toolchain file (set before project() is called).
+set(CMAKE_TOOLCHAIN_FILE "${CMAKE_SOURCE_DIR}/vcpkg/scripts/buildsystems/vcpkg.cmake")
+if (WIN32)
+    set(VCPKG_TARGET_TRIPLET x64-windows)
+endif()
+
+project(speciome)
+
+find_package(Git REQUIRED)  # git is used below to update the submodules (vcpkg)
+execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init --remote
+                WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+                RESULT_VARIABLE GIT_SUBMOD_RESULT)
+if (NOT GIT_SUBMOD_RESULT EQUAL "0")
+    message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
+endif()
+
+# Build as C++20. REQUIRED must be explicitly ON: set() with no value unsets the variable.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Place executables in the top-level build directory; install into the source tree.
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR})
+
+add_subdirectory(src)
+add_subdirectory(tests)
diff --git a/CMakeLists_user.txt b/CMakeLists_user.txt
@@ -0,0 +1,15 @@
+# ./CMakeLists.txt (user configuration: builds src only, no vcpkg toolchain, no tests)
+
+cmake_minimum_required(VERSION 3.16)
+
+project(speciome)
+
+# Build as C++20. REQUIRED must be explicitly ON: set() with no value unsets the variable.
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Place executables in the top-level build directory; install into the source tree.
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(CMAKE_INSTALL_PREFIX ${CMAKE_SOURCE_DIR})
+
+add_subdirectory(src)
diff --git a/README.md b/README.md
diff --git a/ci/do_oclint b/ci/do_oclint
diff --git a/ci/get_code_cov b/ci/get_code_cov
diff --git a/ci/install_oclint b/ci/install_oclint
diff --git a/ci/pics/Codecov.png b/ci/pics/Codecov.png
diff --git a/ci/pics/TravisCI.png b/ci/pics/TravisCI.png
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
@@ -0,0 +1,46 @@
+## Genetic architecture file
+
+### Example
+
+This is a file for an architecture with 3 loci coding for each trait, and where each network is a full graph (all three loci are connected in a triangle-shaped network). There are three chromosomes of equal size. All effect sizes, dominance coefficients and interaction weights are 0.1.
+
+```
+--parameters--
+nvertices 3 3 3 
+nedges 3 3 3 
+nchrom 3
+
+--architecture--
+chromosomes 0.333333 0.666667 1 
+traits 0 0 0 1 1 1 2 2 2 
+locations 0.01 0.03 0.4 0.5 0.65 0.7 0.88 0.99 0.999
+effects 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 
+dominances 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 
+from 0 0 0 1
+to 0 1 2 2
+weights 0 0.1 0.1 0.1 
+from 1 3 3 4
+to 1 4 5 5
+weights 1 0.1 0.1 0.1 
+from 2 6 6 7
+to 2 7 8 8
+weights 2 0.1 0.1 0.1 
+```
+
+### Rules
+
+* The architecture file is a text file organized in two sections, each delimited by the header "--parameters--" or "--architecture--". Its name must be "architecture.txt" and it must be present in the working directory in order to be read.
+
+* Under the "--parameters--" header are expected the names and values of parameters (or rather hyperparameters) that are important for the set-up of the genetic architecture. Those are `nchrom`, `nvertices` and `nedges`, and should be provided just like in a parameter file (see above). They will be used to read the right number of entries from the following section, and to override the parameters in the model once the architecture is loaded.
+
+* Under the "--architecture--" header are expected the names and values of the actual architecture fields, which are essentially lists of parameters. There are three kinds of fields, that differ in how many values they take: chromosome-wise fields, locus-wise fields and edge-wise fields.
+
+  * Chromosome-wise field: `chromosomes`, consisting of the end location of each chromosome (between 0 and 1, where 0 and 1 represent the two ends of the genome). One value per chromosome.
+
+  * Locus-wise fields: `traits`, `locations`, `effects` and `dominances` are the encoded traits (0, 1, or 2), genomic locations (between 0 and 1), additive effect sizes and dominance coefficients of each locus in the genome, respectively. One value per locus.
+
+  * Edge-wise fields: `from`, `to` and `weights` are respectively the indices of the first and second partner, and the interaction weight, of each edge. One value per edge. 
+
+* Each field should be followed by the values it takes (e.g. `chromosomes 0.333333 0.666667 1`, `locations 0.01 0.02 0.45 0.6 0.8 0.9`), but if the field is an edge-wise field the values should be preceded by the index of the trait of the network that field belongs to (e.g. `weights 0 0.56 0.37 -0.45 0.67 0.1 -0.89`, where 0 in second position refers to the ecological trait). Fields and values are all separated by spaces.
+
+A randomly-generated genetic architecture can be saved as a ready-to-use architecture file if `archsave` is set to 1 (in this case the architecture will be saved in the file `architecture.txt`). Note that architecture files saved by the program when `archsave` is 1 contain all the parameters of the model that generated the architecture. For any other architecture file supplied, you need to make sure that the aforementioned expected structure is respected and that all the necessary fields are present. Also note that in general it is a good idea to save the genetic architecture used if you are going to save and analyze genetic data from the simulation, as the output variables do not contain details about the architecture, which might make them difficult to interpret otherwise.
diff --git a/docs/BUILD.md b/docs/BUILD.md
@@ -0,0 +1,51 @@
+## Build (developer)
+
+This uses [vcpkg](https://github.com/microsoft/vcpkg) to install the dependencies needed to build as developer:
+
+* [Boost.Test](https://github.com/boostorg/test) (latest version) for unit testing
+
+### Linux, MacOS
+
+```shell
+git clone git@github.com:rscherrer/speciome.git
+cd speciome
+cp CMakeLists_devel.txt CMakeLists.txt # developer configuration
+git submodule add https://github.com/microsoft/vcpkg
+git submodule update --init --remote
+mkdir build && cd build
+cmake ..
+cmake --build .
+```
+
+Executables for tests are built in `bin/tests/`.
+
+### Windows
+
+```cmd
+git clone git@github.com:rscherrer/speciome.git
+cd speciome
+copy CMakeLists_devel.txt CMakeLists.txt & rem developer configuration
+git submodule add https://github.com/microsoft/vcpkg
+git submodule update --init --remote
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+```
+
+Executables for tests are built in `bin/tests/`.
+
+### IDEs
+
+```shell
+git clone git@github.com:rscherrer/speciome.git
+cd speciome
+cp CMakeLists_devel.txt CMakeLists.txt # developer configuration
+git submodule update --init --recursive
+mkdir build
+cd build
+# Generate VisualStudio project files
+cmake -G "Visual Studio 17 2022" -A x64 ..
+# Generate Xcode project files (Xcode must be installed)
+cmake -G Xcode ..
+```
diff --git a/docs/CMAKE.md b/docs/CMAKE.md
@@ -0,0 +1,14 @@
+## Install CMake
+
+### Linux, MacOS
+
+On Linux/MacOS, use:
+
+```bash
+sudo apt install cmake # replace 'apt' with your distro package manager
+```
+
+### Windows
+
+[Download](https://github.com/Kitware/CMake/releases/download/v3.23.0/cmake-3.23.0-windows-x86_64.msi) and make sure to select the option "Add CMake to the system PATH for the current user" when asked by the installer.
+
diff --git a/docs/GENOMES.md b/docs/GENOMES.md
@@ -0,0 +1,13 @@
+## Saving genomes
+
+If you set `gensave 1` in addition to `datsave 1`, two things will be saved every `tsave` generations:
+
+| File | Variable |
+|--|--|
+| `individual_whole_genomes.dat` | The whole genome of each individual.|
+| `individual_locus_genvalues.dat` | The genetic value of each locus in each individual. |
+
+**Note:** To save space, we use the fact that alleles are binary (0 or 1). Each value in a full genome is an allele at a specific position along one of the two haplotypes of an individual. Therefore, a genome contains twice as many values as there are loci (the organism is diploid). Each value is either 0 or 1 (the two possible alleles). Haplotypes are saved in turns, such that the first N values are all alleles of the first haplotype and the next N values are all alleles of the second haplotype, where N is the number of loci. This does not mean that each saved individual genome is exactly 2N values long, though. In order to save space for this large amount of data, individual genomes are first split into blocks of 64 bits, and each block is converted into a 64bit integer, which is then saved as binary. Therefore, the output file `individual_whole_genomes.dat` must be interpreted on a bit-wise basis in order to retrieve the actual alleles of the individual (i.e. reading it as 64bit integers will show integer-equivalents of chunks of 64 alleles). This also means that for each individual, a multiple of 64 bits will be written to the file, even if 2N alleles is not necessarily a multiple of 64. In other words, for each individual 2N bits will be written to file, and the remaining part of the last 64bit-chunk will be filled with zeros. (You do not have to worry about all that, the R package [speciomer](https://github.com/rscherrer/speciomer) takes care of it.)
+
+**Important:** whole individual genomes take a lot of space. For this reason we advise against saving them too often. To save regular variables frequently and whole genomes less frequently, we recommend running the same simulation twice with different values of `tsave` and making sure that the `seed` is the same. (The seed used by a simulation where no seed was provided can be retrieved in the `paramlog.txt` file, provided that `parsave` is set to 1, or in the `architecture.txt` file provided that `archsave` is set to 1.)
+