# parameters.conf
[FilePath]
# the installation path of SMRT-Analysis on each worker node
# for now, we are using SMRT-Analysis v2.3.0 (smrtanalysis_2.3.0.140936)
SMRT_ANALYSIS_HOME = /home/hadoop/smrtanalysis
# directory for storing temporary data on the master node and on each worker node
# Note that this directory must have enough free space.
TEMP_OUTPUT_FOLDER = /tmp/basemods_spark_data
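# A quick way to check the free space behind this directory on a standard Linux node
# (illustrative only, not part of the pipeline):
# df -h /tmp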
# where to put the cell data: on the master node, in HDFS, or in a shared folder
# "MASTER" or "HDFS" or "SHARED_FOLDER"
DATA_SAVE_MODE = SHARED_FOLDER
# the reference file directory on your master node or in HDFS
REFERENCE_DIR = /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda/sequence
# REFERENCE_DIR = hdfs://127.0.0.1:9000/data/pacbio/lambda_v210
REF_FILENAME = lambda.fasta
# please put the .sa file in the same directory (REFERENCE_DIR) as your reference file (REF_FILENAME)
# if there is no .sa file, assign "None" to REF_SA_FILENAME
# REF_SA_FILENAME = lambda.fasta.sa
REF_SA_FILENAME = None
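# If you want to supply a suffix array, one way to build it is the 'sawriter' tool
# that ships with BLASR/SMRT-Analysis (a rough sketch, assuming 'sawriter' is on
# your PATH; check your local installation):
# sawriter lambda.fasta.sa lambda.fasta
# then set REF_SA_FILENAME = lambda.fasta.sa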
# the SMRT Cell data directory on your master node or in HDFS
CELL_DATA_DIR = /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda_v210
# CELL_DATA_DIR = hdfs://127.0.0.1:9000/data/pacbio/lambda_v210
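# If DATA_SAVE_MODE = HDFS, the reference and cell data would first need to be available
# in HDFS; a rough sketch with the standard Hadoop shell, reusing the example paths above:
# /usr/local/hadoop/bin/hdfs dfs -mkdir -p /data/pacbio
# /usr/local/hadoop/bin/hdfs dfs -put /home/hadoop/workspace/pyworkspace/basemods_spark/data/lambda_v210 /data/pacbio/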
[PipelineArgs]
# the location of the 'hdfs' shell script
HDFS_CMD = /usr/local/hadoop/bin/hdfs
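# A quick sanity check that this path points at a working Hadoop install (illustrative only):
# /usr/local/hadoop/bin/hdfs version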
# the number of processors you allow SMRT-Analysis to use on each worker node
# It is fine to set PROC_NUM to 39 if each worker node has 40 processors.
PROC_NUM = 3
# the number of folds (chunks) to split each bax.h5 file into
BAXH5_FOLDS = 1
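# e.g. (our reading of this option) BAXH5_FOLDS = 2 would split each bax.h5 file into
# 2 chunks so they can be processed in parallel; 1 keeps each file whole:
# BAXH5_FOLDS = 2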
# REF_CHUNKS_FACTOR has been replaced by spark_task_cpus
# It is better that REF_CHUNKS_FACTOR is no greater than CORE_NUM.
# REF_CHUNKS_FACTOR = 2
# for now, READS_TRIM_STRATEGY is disabled
# strategy for trimming reads in repeat regions
# "random" or "mapqv"
READS_TRIM_STRATEGY = random
# maxCoverage in ipdSummary.py
IPDMAXCOVERAGE = 250
# methylation types to be identified; for now there are three kinds: "m6A", "m4C" and "m5C_TET"
# Use ',' as the delimiter.
METHYLATION_TYPES = m6A,m4C
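# e.g. to identify all three supported types:
# METHYLATION_TYPES = m6A,m4C,m5C_TET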
# whether to write IPD values to file or not
# "YES" or "NO"
GET_IPD_FROM_BASH5 = NO
GET_IPD_FROM_CMPH5 = NO
# the master node's login info for scp
# [MasterNodeInfo] is only used when DATA_SAVE_MODE = MASTER
[MasterNodeInfo]
# hostname or IP of your master node
HOST = 127.0.0.1
# host port
HOSTPORT = 22
# user name to access your master node
USERNAME = hadoop
# user password
USERPASSWD = 123
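# These credentials are used for scp transfers to the master node; conceptually something
# like the following (hypothetical file and destination, shown only to illustrate what
# HOST/HOSTPORT/USERNAME are for):
# scp -P 22 <local_result_file> hadoop@127.0.0.1:/tmp/basemods_spark_data/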
[SparkConfiguration]
# should be no greater than the available memory of a worker node
spark_executor_memory = 4g
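# e.g. (hypothetical sizing) on a worker node with 8 GB of RAM you might leave a couple of
# GB for the OS and Hadoop/Spark daemons and set something like:
# spark_executor_memory = 6g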
# Number of cores (i.e. virtual cores/CPU processors) to allocate for each task,
# e.g. 2, 4 or 5.
spark_task_cpus = 1
# default value in Spark Configuration is 0.6
spark_memory_fraction = 0.6
# default value in Spark Configuration is 0.5
spark_memory_storageFraction = 0.5
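# Rough arithmetic under Spark's unified memory model (Spark 1.6+), assuming a 4g executor heap:
# usable memory ~ (4096 MB - 300 MB reserved) * 0.6 ~ 2278 MB
# storage portion (evictable) ~ 2278 MB * 0.5 ~ 1139 MB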
## only for Spark on YARN mode------------------------------------
# for now, this parameter should be equal to the number of worker nodes
# e.g. if you have 5 worker nodes, set the number of executor instances to 5.
spark_executor_instances = 1
# this parameter should preferably be equal to (the number of CPU cores each worker node has - 1)
# e.g. if each worker node has 64 cores (i.e. virtual cores/CPU processors),
# set the number of executor cores to 63.
# OR, set it to the value of "yarn.nodemanager.resource.cpu-vcores"/"yarn.scheduler.maximum-allocation-vcores"
# in $HADOOP_HOME/etc/hadoop/yarn-site.xml.
spark_executor_cores = 4
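# e.g. a hypothetical YARN cluster with 5 worker nodes and 64 vcores each, following the
# guidance above:
# spark_executor_instances = 5
# spark_executor_cores = 63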
## ---------------------------------------------------------------