forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_sider_v4.py
72 lines (61 loc) · 2.61 KB
/
parse_sider_v4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os, cPickle, gzip
def main():
base_dir = "/home/emre/data/sider"
indication_file = base_dir + "meddra_all_label_indications.tsv.gz"
side_effect_file = base_dir + "meddra_all_label_se.tsv.gz"
dump_file = base_dir + "sider.pcl"
pubchem_to_indications, pubchem_to_side_effects = get_sider_info(indication_file, side_effect_file, dump_file)
return
def get_sider_info(indication_file, side_effect_file, dump_file):
if os.path.exists(dump_file):
pubchem_to_indications, pubchem_to_side_effects = cPickle.load(open(dump_file))
else:
pubchem_to_umls_cids, pubchem_to_side_effects = parse_side_effects(side_effect_file)
pubchem_to_umls_cids, pubchem_to_indications = parse_indications(indication_file)
cPickle.dump((pubchem_to_indications, pubchem_to_side_effects), open(dump_file, 'w'))
return pubchem_to_indications, pubchem_to_side_effects
def parse_side_effects(side_effect_file):
f = gzip.open(side_effect_file)
pubchem_to_umls_cids = {}
pubchem_to_side_effects = {}
#cid_to_descriptions = {}
for line in f.readlines():
words = line.strip("\n").split("\t")
# Get Pubchem id (Strip "CID")
cid_flat = "%s" % (abs(int(words[1][3:])) - 100000000)
cid_specific = "%s" % abs(int(words[2][3:]))
try:
term_type, term_cid, term_name = words[4:]
except:
print line
print words
term_type, term_cid, term_name = words[4:]
if term_type != "PT":
continue
pubchem_to_umls_cids.setdefault(cid_flat, set()).add(term_cid)
pubchem_to_umls_cids.setdefault(cid_specific, set()).add(term_cid)
#cid_to_descriptions.setdefault(term_cid, set()).add(term_name.lower())
pubchem_to_side_effects.setdefault(cid_flat, set()).add(term_name)
pubchem_to_side_effects.setdefault(cid_specific, set()).add(term_name)
f.close()
return pubchem_to_umls_cids, pubchem_to_side_effects
def parse_indications(indication_file):
f = gzip.open(indication_file)
pubchem_to_umls_cids = {}
pubchem_to_indications = {}
for line in f.readlines():
words = line.strip("\n").split("\t")
# Get Pubchem id (Strip "CID")
cid_flat = "%s" % (abs(int(words[1][3:])) - 100000000)
cid_specific = "%s" % abs(int(words[2][3:]))
term_type, term_cid, term_name = words[6:]
if term_type != "PT":
continue
pubchem_to_umls_cids.setdefault(cid_flat, set()).add(term_cid)
pubchem_to_umls_cids.setdefault(cid_specific, set()).add(term_cid)
pubchem_to_indications.setdefault(cid_flat, set()).add(term_name)
pubchem_to_indications.setdefault(cid_specific, set()).add(term_name)
f.close()
return pubchem_to_umls_cids, pubchem_to_indications
if __name__ == "__main__":
main()