-
Notifications
You must be signed in to change notification settings - Fork 4
/
config.yaml
137 lines (122 loc) · 4.93 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Input settings: the base input directory and the list of datasets
# (experiments) to run.
# NOTE(review): the nesting below was reconstructed from the key names and
# comments; confirm it against the code that loads this file.
input_settings:
  # Base input directory
  input_dir: "inputs"

  # "datasets" is a list of experiments to run, each with the following
  # options (a * indicates a required option):
  #   *net_version: name to give this network.
  #       Will be used to organize output files.
  #
  #   net_files: list of one or more network files,
  #       or a .mat file containing multiple networks.
  #       multi_net must be set to true if multiple networks are to be
  #       combined.
  #       The path should be relative to <input_dir>/<net_version>.
  #
  #   string_net_files: a list of full STRING files, meaning all 16
  #       tab-delimited columns are present.
  #       The path should be relative to <input_dir>/<net_version>.
  #
  #   *exp_name: the name to give the experiment to run.
  #
  #   *pos_neg_file: a 3-column, tab-delimited file with two lines per term.
  #       Col1: term, Col2: 1 or -1 denoting positive or negative,
  #       Col3: a comma-separated list of proteins.
  #
  #   only_functions_file: a file containing the GO term IDs to run.
  #
  #   taxon_file: a tab-delimited file indicating the NCBI taxonomy ID
  #       (2nd col) for each gene/UniProt ID (1st col).
  #       Necessary to perform Leave-One-Species-Out (LOSO) evaluation.
  #
  #   only_taxon_file: a file containing the taxon IDs to evaluate.
  #       Also used to get the names of the species.
  #
  #   pos_neg_file_eval: same format as pos_neg_file, but these annotations
  #       are only used to evaluate. Some use cases: temporal holdout, or
  #       recovering annotations of another evidence code.
  datasets:
    - net_version: "2018_06-seq-sim-e0_1"
      net_files: ["2018_06-seq-sim-e0_1-net.txt"]
      # string_net_files: [""]
      exp_name: "expc-bp"
      pos_neg_file: "pos-neg/expc/expc-bp.tsv"
      # only_functions_file: ""
      # If passing in multiple networks or one or more 'string_net_files',
      # set multi_net to true and use these settings.
      multi_net: false
      net_settings:
        # Method to combine the networks: either swsn, gmw, or unweighted.
        weight_method: 'swsn'
        # If full STRING files are given, 'string_nets' and 'string_cutoff'
        # are required.
        ## Either core, nontransferred, all, a comma-separated list of the
        ## STRING networks, or a combination of the above. Default: all
        # string_nets: 'core'
        ## Cutoff of the STRING scores. Default: 150
        # string_cutoff: 400
        # If a .mat file is given, 'net_names_file' and 'node_ids_file'
        # are required.
        net_names_file: ""
        node_ids_file: ""
# Output settings. The outputs will follow this structure:
#   outputs/<net_version>/<exp_name>/<alg_name>/output_files
output_settings:
  # Base directory under which all output files are written.
  output_dir: "outputs/"
# Algorithms to run along with their settings.
# Every option is a list; all combinations of parameters will be run.
# NOTE(review): the per-algorithm nesting was reconstructed from the key
# names and comments; confirm it against the code that loads this file.
algs:
  # local has no parameters
  local:
    should_run: [false]

  fastsinksource:
    should_run: [true]
    alpha: [0.95]
    eps: [0]
    max_iters: [10]

  # sinksource-bounds
  # TODO finish refactoring
  sinksource-bounds:
    should_run: [false]
    rank_all: [false]
    rank_pos_neg: [false]
    compare_ranks: [false]
    alpha: [0.95]
    max_iters: [10]

  # genemania
  genemania:
    should_run: [false]
    # Written with a decimal point in the mantissa: YAML 1.1 loaders such as
    # PyYAML resolve a bare `1e-05` as a string rather than a float.
    tol: [1.0e-05]

  # birgrank
  birgrank:
    should_run: [false]
    # path to the OBO file from the input_dir
    obo_file: ["inputs/goa/2017_09/2017-09-26-go.obo"]
    alpha: [0.5]
    lambda: [0.5]
    mu: [0.5]
    theta: [0.5]
    # Decimal point in the mantissa so YAML 1.1 loaders read this as a float.
    eps: [1.0e-04]
    max_iters: [1000]

  # aptrank
  aptrank:
    should_run: [false]
    # path to the OBO file from the input_dir
    obo_file: ["inputs/goa/2017_09/2017-09-26-go.obo"]
    lambda: [0.5]
    k: [8]
    s: [5]
    t: [0.5]
    diff_type: ["twoway"]

  # fastsinksourceplus:
  #   alpha: 0.9
  #   eps: 0.0001
  #   max_iters: 1000
  #   # this option isn't needed if alpha is < 1
  #   sinksourceplus_lambda: 0
# Evaluation settings for CV or LOSO can be set here.
# TODO these should probably be unique per dataset
eval_settings:
  ### LOSO parameters
  # Don't leave out annotations when running the algorithms.
  # Can only be used in conjunction with a pos_neg_file_eval.
  keep_ann: false
  # Minimum number of annotations for each term in the "left-out" species
  # to test. Default: 10
  num_test_cutoff: 10
  # If --pos-neg-file-eval is given and --keep-ann is False, only evaluate
  # terms that have at least 2% of annotations.
  # Useful to speed up processing for term-based algorithms.
  eval_goterms_with_left_out_only: false
  # Postfix to append to the output file. Useful if running multiple in
  # parallel.
  # TODO figure out how to automatically combine the multiple output files
  postfix: ""