# parameters.conf
[FilePath]
# the installation path of SMRT-Analysis on each worker node
# for now, we are using SMRT-Analysis v2.3.0 (smrtanalysis_2.3.0.140936)
SMRT_ANALYSIS_HOME = /home/hadoop/smrtanalysis
# directory for storing temporary data on the master node and on each worker node
# Note that this directory must have enough free space.
TEMP_OUTPUT_FOLDER = /tmp/basemods_spark_data
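# A quick way to check the free space behind this directory on a standard Linux node
# (illustrative only, not part of the pipeline):
# df -h /tmp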
# where to put the cell data: on the master node, in HDFS, or in a shared folder
# "MASTER" or "HDFS" or "SHARED_FOLDER"
DATA_SAVE_MODE = SHARED_FOLDER
# the reference file directory on your master node or in HDFS
REFERENCE_DIR = /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda/sequence
# REFERENCE_DIR = hdfs://127.0.0.1:9000/data/pacbio/lambda_v210
REF_FILENAME = lambda.fasta
# please put the .sa file in the same directory (REFERENCE_DIR) as your reference file (REF_FILENAME)
# if there is no .sa file, assign "None" to REF_SA_FILENAME
# REF_SA_FILENAME = lambda.fasta.sa
REF_SA_FILENAME = None
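# If you want to supply a suffix array, one way to build it is the 'sawriter' tool
# that ships with BLASR/SMRT-Analysis (a rough sketch, assuming 'sawriter' is on
# your PATH; check your local installation):
# sawriter lambda.fasta.sa lambda.fasta
# then set REF_SA_FILENAME = lambda.fasta.sa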
# the SMRT Cell data directory on your master node or in HDFS
CELL_DATA_DIR = /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda_v210
# CELL_DATA_DIR = hdfs://127.0.0.1:9000/data/pacbio/lambda_v210
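# If DATA_SAVE_MODE = HDFS, the reference and cell data would first need to be available
# in HDFS; a rough sketch with the standard Hadoop shell, reusing the example paths above:
# /usr/local/hadoop/bin/hdfs dfs -mkdir -p /data/pacbio
# /usr/local/hadoop/bin/hdfs dfs -put /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda_v210 /data/pacbio/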
[PipelineArgs]
# the location of the 'hdfs' shell script
HDFS_CMD = /usr/local/hadoop/bin/hdfs
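# A quick sanity check that this path points at a working Hadoop install (illustrative only):
# /usr/local/hadoop/bin/hdfs version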
# the number of processors you allow SMRT-Analysis to use on each worker node
# It is fine to set PROC_NUM to 39 if each worker node has 40 processors.
PROC_NUM = 3
# the number of folds (chunks) to split each bax.h5 file into
BAXH5_FOLDS = 1
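# e.g. (our reading of this option) BAXH5_FOLDS = 2 would split each bax.h5 file into
# 2 chunks so they can be processed in parallel; 1 keeps each file whole:
# BAXH5_FOLDS = 2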
# REF_CHUNKS_FACTOR has been replaced by spark_task_cpus
# It is better that REF_CHUNKS_FACTOR is no greater than CORE_NUM.
# REF_CHUNKS_FACTOR = 2
# for now, READS_TRIM_STRATEGY is disabled
# strategy for trimming reads in repeat regions
# "random" or "mapqv"
READS_TRIM_STRATEGY = random
# maxCoverage in ipdSummary.py
IPDMAXCOVERAGE = 250
# methylation types to be identified; for now there are three kinds: "m6A", "m4C" and "m5C_TET"
# Use ',' as the delimiter.
METHYLATION_TYPES = m6A,m4C
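# e.g. to identify all three supported types:
# METHYLATION_TYPES = m6A,m4C,m5C_TET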
# whether to write IPD values to file or not
# "YES" or "NO"
GET_IPD_FROM_BASH5 = NO
GET_IPD_FROM_CMPH5 = NO
# the master node's login info for scp
# [MasterNodeInfo] is only used when DATA_SAVE_MODE = MASTER
[MasterNodeInfo]
# hostname or IP of your master node
HOST = 127.0.0.1
# host port
HOSTPORT = 22
# user name to access your master node
USERNAME = hadoop
# user password
USERPASSWD = 123
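# These credentials are used for scp transfers to the master node; conceptually something
# like the following (hypothetical file and destination, shown only to illustrate what
# HOST/HOSTPORT/USERNAME are for):
# scp -P 22 <local_result_file> hadoop@127.0.0.1:/tmp/basemods_spark_data/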
[SparkConfiguration]
# should be no greater than the available memory of a worker node
spark_executor_memory = 4g
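# e.g. (hypothetical sizing) on a worker node with 8 GB of RAM you might leave a couple of
# GB for the OS and Hadoop/Spark daemons and set something like:
# spark_executor_memory = 6g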
# Number of cores (i.e. virtual cores/CPU processors) to allocate for each task,
# e.g. 2, 4 or 5.
spark_task_cpus = 1
# default value in Spark Configuration is 0.6
spark_memory_fraction = 0.6
# default value in Spark Configuration is 0.5
spark_memory_storageFraction = 0.5
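# Rough arithmetic under Spark's unified memory model (Spark 1.6+), assuming a 4g executor heap:
# usable memory ~ (4096 MB - 300 MB reserved) * 0.6 ~ 2278 MB
# storage portion (evictable) ~ 2278 MB * 0.5 ~ 1139 MB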
## only for Spark on YARN mode------------------------------------
# for now, this parameter should be equal to the number of worker nodes
# e.g. if you have 5 worker nodes, set the number of executor instances to 5.
spark_executor_instances = 1
# this parameter should preferably be equal to (the number of CPU cores each worker node has - 1)
# e.g. if each worker node has 64 cores (i.e. virtual cores/CPU processors),
# set the number of executor cores to 63.
# OR, set it to the value of "yarn.nodemanager.resource.cpu-vcores"/"yarn.scheduler.maximum-allocation-vcores"
# in $HADOOP_HOME/etc/hadoop/yarn-site.xml.
spark_executor_cores = 4
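# e.g. a hypothetical YARN cluster with 5 worker nodes and 64 vcores each, following the
# guidance above:
# spark_executor_instances = 5
# spark_executor_cores = 63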
## ---------------------------------------------------------------