-
Notifications
You must be signed in to change notification settings - Fork 4
/
calculate_gene_expression.py
executable file
·89 lines (77 loc) · 3 KB
/
calculate_gene_expression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import time
import sys
import csv
import getopt
from mpi4py import MPI
import pandas as pd
t0 = time.time()
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
nprocs = comm.Get_size()
WORKING_DIR = os.path.join(os.path.realpath('.'), 'genes', 'expressions')
def filterGeneExpressionFile (TISSUE_NAME, data):
tissue_samples_file = os.path.join(WORKING_DIR, TISSUE_NAME + "_samples.csv")
df = pd.read_csv(tissue_samples_file, sep=',', header=0, usecols=['SAMPLE_ID'])
count = 0
expressionSum= 0
for row_gene in data[1:]: # ignore first line, it is CSV header
GENE_SAMPLE = int(row_gene[0])
result = df[df["SAMPLE_ID"] == GENE_SAMPLE]
if not result.empty:
#print ("Process {}: ".format(rank), row_gene)
count += 1
expressionSum += float(row_gene[4])
return count, expressionSum
def main(argv):
try:
opts, args = getopt.getopt(argv, "hg:t:o:", ["gene", "tissue="])
except getopt.GetoptError:
print ('-g <gene Uniprot ID or file path> -t <tissue name> -o <output file>')
sys.exit()
for opt, arg in opts:
if opt == '-h':
print ('-g <gene Uniprot ID or file path> -t <tissue name> -o <output file>')
sys.exit()
elif opt in ("-g", "--gene"):
GENE_NAME = arg
elif opt in ("-t", "--tissue"):
TISSUE_NAME = arg
elif opt in ("-o", "--output"):
OUTPUT_FILE = arg
# filter gene expressions from CSV parts for only selected tissue samples on N cores
if rank == 0:
data = []
try:
for i in range(1, nprocs + 1):
csv_file = os.path.join(WORKING_DIR, GENE_NAME + '_part_{}.csv'.format(i))
reader = csv.reader(open(csv_file), delimiter=",")
data.append([row for row in reader])
except:
print ("Z-score calculation for %s failed." % GENE_NAME)
else:
data = None
try:
data = comm.scatter(data, root=0)
expression_count, zscore_sum = filterGeneExpressionFile(TISSUE_NAME, data)
expression_count= comm.gather(expression_count, root=0)
zscore_sum = comm.gather(zscore_sum, root=0)
except:
print ("Error in MPI scatter or gather process")
sys.exit()
if rank == 0:
try:
average_zscore = sum(zscore_sum)/sum(expression_count)
print (GENE_NAME)
print('count number is {}'.format(sum(expression_count)))
print('Z-score average is {}'.format(average_zscore))
print('calculated in {:.3f} sec'.format(time.time() - t0))
with open(OUTPUT_FILE, 'a+') as output_file:
writer = csv.writer(output_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
writer.writerow([GENE_NAME, average_zscore])
return average_zscore
except:
print ("Error writing expression scores to CSV")
sys.exit(0)
if __name__ == "__main__":
main(sys.argv[1:])