-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add some infinitives with clitics from a user who found some infinitive tokenization errors stanfordnlp/stanza#1401
- Loading branch information
1 parent
b8d4e74
commit 6d83af4
Showing
3 changed files
with
298 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
# sent_id = 0 | ||
# text = juntarse. | ||
1-2 juntarse _ _ _ _ _ _ _ SpaceAfter=No | ||
1 juntar juntar VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 1 | ||
# text = Juntarse. | ||
1-2 Juntarse _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Juntar juntar VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ Case=Acc|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 2 | ||
# text = Decírselo. | ||
1-3 Decírselo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 3 | ||
# text = decírselo. | ||
1-3 decírselo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 4 | ||
# text = Decírmelo. | ||
1-3 Decírmelo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 me él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 5 | ||
# text = decírmelo. | ||
1-3 decírmelo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 me él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 6 | ||
# text = Dárselo. | ||
1-3 Dárselo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Dar dar VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 7 | ||
# text = dárselo. | ||
1-3 dárselo _ _ _ _ _ _ _ SpaceAfter=No | ||
1 dar dar VERB _ VerbForm=Inf 0 root _ _ | ||
2 se él PRON _ _ 1 _ _ _ | ||
3 lo él PRON _ _ 2 _ _ _ | ||
4 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 8 | ||
# text = atarlos. | ||
1-2 atarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 atar atar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 9 | ||
# text = Atarlos. | ||
1-2 Atarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Atar atar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 10 | ||
# text = besarlos. | ||
1-2 besarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 besar besar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 11 | ||
# text = Besarlos. | ||
1-2 Besarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Besar besar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 12 | ||
# text = compartirlos. | ||
1-2 compartirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 compartir compartir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 13 | ||
# text = Compartirlos. | ||
1-2 Compartirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Compartir compartir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 14 | ||
# text = decirlos. | ||
1-2 decirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 15 | ||
# text = Decirlos. | ||
1-2 Decirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Decir decir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 16 | ||
# text = haberlos. | ||
1-2 haberlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 haber haber VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 17 | ||
# text = Haberlos. | ||
1-2 Haberlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Haber haber VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 18 | ||
# text = hacerlos. | ||
1-2 hacerlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 hacer hacer VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 19 | ||
# text = Hacerlos. | ||
1-2 Hacerlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Hacer hacer VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 20 | ||
# text = invadirlos. | ||
1-2 invadirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 invadir invadir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 21 | ||
# text = Invadirlos. | ||
1-2 Invadirlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Invadir invadir VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 22 | ||
# text = llamarlos. | ||
1-2 llamarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 llamar llamar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 23 | ||
# text = Llamarlos. | ||
1-2 Llamarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Llamar llamar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 24 | ||
# text = saberlos. | ||
1-2 saberlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 saber saber VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 25 | ||
# text = Saberlos. | ||
1-2 Saberlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Saber saber VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 26 | ||
# text = tenerlos. | ||
1-2 tenerlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 tener tener VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 27 | ||
# text = Tenerlos. | ||
1-2 Tenerlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Tener tener VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 28 | ||
# text = usarlos. | ||
1-2 usarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 usar usar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 29 | ||
# text = Usarlos. | ||
1-2 Usarlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Usar usar VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 30 | ||
# text = verlos. | ||
1-2 verlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 ver ver VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
||
# sent_id = 31 | ||
# text = Verlos. | ||
1-2 Verlos _ _ _ _ _ _ _ SpaceAfter=No | ||
1 Ver ver VERB _ VerbForm=Inf 0 root _ _ | ||
2 los él PRON _ _ 1 obj _ _ | ||
3 . . PUNCT _ PunctType=Peri 1 punct _ _ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
""" | ||
Goal: | ||
Add infinitives with pronouns on the end to the Spanish combined dataset | ||
verlos | ||
hacerlos | ||
haberlos | ||
etc etc | ||
Starting from a list in this issue: | ||
https://github.com/stanfordnlp/stanza/issues/1401 | ||
""" | ||
|
||
from stanza.utils.conll import CoNLL | ||
|
||
# Existing hand-picked MWT examples; the generated sentences are appended
# after these and continue their sent_id numbering.
starter = CoNLL.conll2doc("handpicked.mwt")

# Infinitives that receive an enclitic "los" (e.g. "verlos", "hacerlos").
VERBS = [
    "atar",
    "besar",
    "compartir",
    "decir",
    "haber",
    "hacer",
    "invadir",
    "llamar",
    "saber",
    "tener",
    "usar",
    "ver",
]


def _clitic_sentence(sent_id, form, lemma, clitic="los"):
    """Return one CoNLL-U sentence block for ``form + clitic + "."``.

    The block has two comment lines (``sent_id`` and ``text``), a 1-2
    multi-word token range line for the fused surface form, the infinitive,
    the clitic pronoun and the final period.

    Every row is assembled from a list of columns joined with tabs, because
    CoNLL-U columns are tab-separated and hand-typed separators inside a
    string literal are easily turned into spaces by an editor.

    Args:
        sent_id: integer written in the ``# sent_id`` comment.
        form: surface form of the infinitive (may be capitalised).
        lemma: lemma of the infinitive.
        clitic: pronoun attached to the infinitive.

    Returns:
        The sentence as a single newline-joined string without a trailing
        newline.
    """
    surface = form + clitic
    rows = [
        ["1-2", surface, "_", "_", "_", "_", "_", "_", "_", "SpaceAfter=No"],
        ["1", form, lemma, "VERB", "_", "VerbForm=Inf", "0", "root", "_", "_"],
        ["2", clitic, "él", "PRON", "_", "_", "1", "obj", "_", "_"],
        ["3", ".", ".", "PUNCT", "_", "PunctType=Peri", "1", "punct", "_", "_"],
    ]
    lines = [
        "# sent_id = %d" % sent_id,
        "# text = %s." % surface,
    ]
    lines.extend("\t".join(row) for row in rows)
    return "\n".join(lines)


# Continue numbering from the last sentence already present in the starter
# file (its sent_id is expected to be an integer string).
sent_id = int(starter.sentences[-1].sent_id)

new_sentences = []
for verb in VERBS:
    # Emit the lowercase form first, then the capitalised form; the lemma
    # stays lowercase in both.
    for form in (verb, verb[0].upper() + verb[1:]):
        sent_id += 1
        new_sentences.append(_clitic_sentence(sent_id, form, verb))

# NOTE(review): the "C" format spec presumably serialises the document as
# CoNLL-U -- confirm against the stanza Document implementation.
print("{:C}".format(starter))
print()

# One blank line after each sentence, as sentence separator.
for sentence in new_sentences:
    print(sentence)
    print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters