-
Notifications
You must be signed in to change notification settings - Fork 5
/
tira_evaluation.py
executable file
·164 lines (124 loc) · 6.08 KB
/
tira_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/env python
from argparse import ArgumentParser, Namespace
import json
import os
import pickle
import codecs
import time
import lysfastparse.utils
import lysfastparse.bcovington.utils_bcovington
import tempfile
import yaml
import subprocess
import sys
# Keys read from each entry of the dataset's metadata.json.
# LCODE / TCODE are passed to select_model() as the language and treebank codes.
LCODE="lcode"
TCODE="tcode"
# Not referenced by the code visible in this file.
GOLDFILE="goldfile"
# File name (relative to the output directory) the parse is written to.
OUTFILE="outfile"
# Input file (relative to the dataset directory) used when the run type is "conllu".
PSEGMORFILE="psegmorfile"
# Input file (relative to the dataset directory) used when the run type is "raw".
RAWFILE="rawfile"
# Not referenced by the code visible in this file.
NAME_TREEBANK="name"
# Run-type names; the driver below compares args.r against string literals
# rather than these constants.
R_RAW = "raw"
R_UDPIPE = "udpipe"
# Key of the YAML configuration whose value is passed on as the UDPipe binary path.
YAML_UDPIPE="udpipe"
def get_models_dict(path_models):
    """Index the files found in ``path_models`` by language and treebank code.

    The first two dot-separated fields of each file name are used as the
    language code and the treebank code, e.g. ``<lcode>.<tcode>.<...>.model``.

    Args:
        path_models: Directory to list (not recursive).

    Returns:
        A nested dict ``{lcode: {tcode: {"model": path, "params": path}}}``.
        ``"model"`` is the path of the file ending in ``.model`` and
        ``"params"`` the path of the file ending in ``.pickle``; either one
        is ``None`` when no such file was seen for that pair.
    """
    d = {}
    for name in os.listdir(path_models):
        name_split = name.split(".")
        if len(name_split) < 2:
            # Entries without a "<lcode>.<tcode>" prefix (a README, a
            # sub-directory, ...) used to raise IndexError and abort the
            # whole run; they cannot describe a model, so skip them.
            continue
        l, t = name_split[0], name_split[1]
        entry = d.setdefault(l, {}).setdefault(t, {"model": None,
                                                   "params": None})
        path = os.path.join(path_models, name)
        if name.endswith(".model"):
            entry["model"] = path
        if name.endswith(".pickle"):
            entry["params"] = path
    return d
def select_model(lcode, tcode, dict_models):
    """Pick the (model path, params path) pair to use for a dataset.

    Lookup order:
      1. the exact ``lcode`` / ``tcode`` entry of ``dict_models``;
      2. the preferred treebank registered for ``lcode`` in the table below;
      3. the ``"en"`` / ``"0"`` entry;
      4. ``(None, None)`` when none of the above is available.

    Args:
        lcode: Language code of the dataset.
        tcode: Treebank code of the dataset.
        dict_models: Nested dict ``{lcode: {tcode: {"model": ..., "params": ...}}}``.

    Returns:
        A ``(model, params)`` tuple taken from the selected entry.
    """
    # Treebank to fall back on, per language, when the requested one is unknown.
    preferred_treebank = {
        'bxr':'0','hsb':'0','kmr':'0','sme':'0','el': '0', 'en': '0',
        'zh': '0', 'vi': '0', 'ca': '0', 'it': '0', 'eu': '0', 'ar': '0',
        'ga': '0', 'cs': '0', 'et': '0', 'gl': '0', 'id': '0',
        'es': 'ancora', 'ru': 'syntagrus', 'nl': '0', 'pt': 'br',
        'no': 'bokmaal', 'tr': '0', 'lv': '0', 'grc': 'proiel', 'got': '0',
        'ro': '0', 'pl': '0', 'fr': '0', 'bg': '0', 'hr': '0', 'de': '0',
        'hu': '0', 'fa': '0', 'hi': '0', 'fi': 'ftb', 'da': '0', 'ja': '0',
        'he': '0', 'kk': '0', 'la': 'ittb', 'ko': '0', 'sv': '0', 'ur': '0',
        'sk': '0', 'cu': '0', 'uk': '0', 'sl': '0', 'ug': '0',
    }

    def _lookup(language, treebank):
        # Raises KeyError when any level of the nested dict is missing.
        entry = dict_models[language][treebank]
        return entry["model"], entry["params"]

    # Known language and known treebank.
    try:
        return _lookup(lcode, tcode)
    except KeyError:
        pass

    # Known language, unknown treebank: use the language's preferred one.
    try:
        return _lookup(lcode, preferred_treebank[lcode])
    except KeyError:
        pass

    # Neither is known: fall back to the English model if there is one.
    if "en" not in dict_models or "0" not in dict_models["en"]:
        return None, None
    return _lookup("en", "0")
def get_udpipe_models(path_models):
    """Map UDPipe model names to the files found in ``path_models``.

    The key of each entry is the lower-cased part of the file name that
    precedes the first ``"-ud-"``; the value is the file's path.

    Args:
        path_models: Directory to list (not recursive).

    Returns:
        A dict ``{name: path}``. When several files share a name, the one
        listed last by ``os.listdir`` is kept.
    """
    return {
        filename.split("-ud-")[0].lower(): path_models+os.sep+filename
        for filename in os.listdir(path_models)
    }
def select_udpipe_model(name_treebank,dict_udpipe_models):
    """Return the UDPipe model path to use for ``name_treebank``.

    Tries the full treebank name first, then the part before its first
    ``"-"``, and finally the ``"english"`` entry.

    Args:
        name_treebank: Treebank name used as lookup key.
        dict_udpipe_models: Dict ``{name: path}`` of available models.

    Returns:
        The path stored under the first key that matches.

    Raises:
        KeyError: If no key matches and there is no ``"english"`` entry.
    """
    if name_treebank in dict_udpipe_models:
        return dict_udpipe_models[name_treebank]

    # Drop the treebank-specific suffix and retry with the bare prefix.
    prefix = name_treebank.split("-")[0]
    if prefix in dict_udpipe_models:
        return dict_udpipe_models[prefix]

    return dict_udpipe_models["english"]
if __name__ == '__main__':
    # Command-line driver: for every dataset listed in <dataset>/metadata.json,
    # select a trained model and launch run_model.py on it, unless the output
    # file already exists or the model file is missing.

    parser = ArgumentParser()
    parser.add_argument("-c", dest="c", help="Input dataset",metavar="FILE")
    parser.add_argument("-r", dest="r",help="Input run [raw|conllu]", type=str)
    parser.add_argument("-o", dest="o",help="Output directory",metavar="FILE")
    parser.add_argument("-m", dest="m", help="Models directory",metavar="FILE")
    parser.add_argument("-e", dest="e", help="Embeddings directory",metavar="FILE")
    parser.add_argument("-um", dest="um",help="UDpipe models direcotry", metavar="FILE")
    parser.add_argument("--dynet-mem", dest="dynet_mem", help="It is needed to specify this parameter")
    parser.add_argument("-conf", dest="conf")

    args = parser.parse_args()
    # Every print below receives one pre-formatted string, so the emitted text
    # is the same whether ``print`` is the Python 2 statement or a function.
    print("args %s" % (args,))

    # Close the configuration file as soon as it has been parsed.
    with open(args.conf) as conf_file:
        config = yaml.safe_load(conf_file)

    with open(args.c+os.sep+"metadata.json") as data_file:
        metadata_datasets = json.load(data_file)

    dict_models = get_models_dict(args.m)
    # The UDPipe model index does not depend on the dataset: it is built on
    # first use (only "raw" runs need it) and reused for later datasets.
    dict_udpipe_models = None

    for metadata in metadata_datasets:

        path_model, path_params = select_model(metadata[LCODE], metadata[TCODE], dict_models)

        # No usable model for this dataset (not even the fallback one).
        if path_model is None: continue

        path_udpipe_bin = "none"
        path_udpipe_model = "none"

        # Third dot-separated field of the model file name; it names the
        # POS/FEATS embeddings to load.
        name_extrn_emb = os.path.basename(path_model).split(".")[2]

        print("Processing file %s" % (metadata[PSEGMORFILE],))
        print("Model: %s" % (path_model,))
        print("Params: %s" % (path_params,))
        print("POS/FEATS embeddings:  %s" % (name_extrn_emb,))

        path_pos_embeddings = os.sep.join([args.e,"UD_POS_embeddings",name_extrn_emb])
        path_feats_embeddings = os.sep.join([args.e,"UD_FEATS_embeddings",name_extrn_emb])
        path_embeddings = os.sep.join([args.e,"word-embeddings-conll17",metadata[LCODE]+".vectors"])
        path_output = os.sep.join([args.o,metadata[OUTFILE]])

        if args.r == "conllu":
            print("Parsing the segmor output from UDPipe")
            path_input = os.sep.join([args.c,metadata[PSEGMORFILE]])
        elif args.r == "raw":
            print("Using the raw file with...")
            if dict_udpipe_models is None:
                dict_udpipe_models = get_udpipe_models(args.um)
            path_input = os.sep.join([args.c,metadata[RAWFILE]])
            path_udpipe_bin = config[YAML_UDPIPE]
            path_udpipe_model = select_udpipe_model(name_extrn_emb.replace("UD_","").lower(),dict_udpipe_models)
            print("path_udpipe_model %s" % (path_udpipe_model,))
        else:
            raise NotImplementedError

        print("Path output %s" % (path_output,))

        if os.path.exists(path_output):
            print("%s has been previously computed" % (path_output,))
        elif not os.path.exists(path_model):
            print("%s there is no %s  model" % (path_output, path_model))
        else:
            # Pass the arguments as a list instead of a shell string so that
            # paths containing spaces or shell metacharacters reach
            # run_model.py unchanged.
            command = ["python", "run_model.py", "-p", path_params, "-m", path_model,
                       "-o", path_output, "-epe", path_pos_embeddings, "-efe", path_feats_embeddings,
                       "-ewe", path_embeddings,
                       "-r", args.r, "-i", path_input,
                       "--dynet-mem", args.dynet_mem,
                       "-udpipe_bin", path_udpipe_bin,
                       "-udpipe_model", path_udpipe_model]
            subprocess.call(command)