-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.cpp
126 lines (121 loc) · 4.7 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include "Tokenizers.h"
#include <vector>
#include <fstream>
#include <iostream>
#include <cstdlib>
/// Outcome of command-line parsing, passed from handle_cli() to main().
struct CommandLineArgs {
    /// Status / mode code. As used in this file: handle_cli() yields 0 when no
    /// arguments were given, 1 when an input corpus was selected, and -1 after
    /// printing help or an error; main() additionally recognises 2 as "use
    /// vocab_size", although handle_cli() never produces that value.
    int handle;
    /// Requested vocabulary size; only consulted by main() when handle == 2.
    unsigned long long vocab_size;
    /// Path of the corpus file to read.
    std::string filename;
};
/// Reads the file at `filename` line by line.
///
/// @param filename Path of the text file to read.
/// @return One list entry per line, in file order, without line terminators.
///         The list is empty when the file cannot be opened or has no lines.
std::list<std::string> load_text(std::string filename) {
    std::ifstream corpus(filename);
    std::list<std::string> lines;
    // std::getline fails immediately on a stream that did not open, so the
    // loop body simply never runs in that case.
    for (std::string current; std::getline(corpus, current);) {
        lines.push_back(current);
    }
    return lines;
}
/// Prints the first (up to) five entries of `vocab_list` to std::cout as a
/// small ASCII table framed by dashed border lines.
///
/// Each row has the form `| <index> | <word><padding> |`, where the padding
/// widens the word column to `word_size` characters.
///
/// @param vocab_list Words to display; fewer than five entries prints fewer rows.
/// @param word_size  Width of the word column. Words longer than this are
///                   printed in full without padding.
void print_table(std::list<std::string> vocab_list, unsigned long int word_size) {
    const std::string border(8 + word_size, '-');
    std::cout << border << '\n';

    auto entry = vocab_list.cbegin();
    // Step exactly one element per row and stop at the end of the list, so
    // short lists are never read past their last element.
    for (int row = 0; row < 5 && entry != vocab_list.cend(); ++row, ++entry) {
        std::cout << "| " << row << " | " << *entry;
        // Pad only when the word is shorter than the column: the subtraction
        // is unsigned and would wrap to a huge count otherwise.
        if (entry->length() < word_size) {
            std::cout << std::string(word_size - entry->length(), ' ');
        }
        std::cout << " |" << '\n';
    }

    std::cout << border << std::endl;
}
/// Parses the command line into a CommandLineArgs value.
///
/// Recognised options:
///   -h / --help          Only honoured as the first argument; prints usage
///                        and sets handle to -1.
///   -i / --input-corpus  Uses the next argument as the corpus filename and
///                        sets handle to 1.
///
/// With no arguments the defaults are returned with handle 0. If arguments
/// were given but none was recognised, they are echoed after "Unknown
/// argument(s)" and handle is -1. An input option with no filename after it
/// is reported and also yields handle -1.
///
/// vocab_size is always returned as 0; no option sets it here.
///
/// @param argc Argument count as passed to main().
/// @param argv Argument vector as passed to main().
/// @return The parsed handle code, vocabulary size and corpus filename.
CommandLineArgs handle_cli(int argc, char *argv[]) {
    unsigned long long int vocab_size = 0;
    std::string filename = "./readme.txt";
    int handle = 0;
    if (argc <= 1) return CommandLineArgs{handle, vocab_size, filename};

    const std::string first_argument = argv[1];
    if (first_argument == "--help" || first_argument == "-h") {
        std::cout << "Usage: TokenizerTest [options]" << std::endl;
        std::cout << " -h/--help: Displays this help." << std::endl;
        std::cout << " -i/--input-corpus: Set a filename for a corpus." << std::endl;
        handle = -1;
    }

    // Start at 1: argv[0] is the program name, not an option.
    for (int ii = 1; ii < argc; ii++) {
        const std::string argument = argv[ii];
        if (argument == "--input-corpus" || argument == "-i") {
            // The option needs a value; argv[argc] is a null pointer, and
            // building a std::string from it is undefined behaviour.
            if (ii + 1 >= argc) {
                std::cout << "Missing filename after " << argument << std::endl;
                return CommandLineArgs{-1, vocab_size, filename};
            }
            handle = 1;
            filename = argv[ii + 1];
            ++ii;  // the value has been consumed; do not re-parse it as an option
        }
    }

    if (handle == 0) {
        // Reaching here means no argument matched a known option, so every
        // argument is reported; no filtering of the input option is needed.
        std::cout << "Unknown " << (argc == 2 ? "argument " : "arguments ");
        for (int ii = 1; ii < argc; ii++) {
            std::cout << argv[ii];
            if (ii != argc - 1) {
                std::cout << " ";
            }
        }
        std::cout << std::endl;
        handle = -1;
    }
    return CommandLineArgs{handle, vocab_size, filename};
}
/// Program entry point: parses the command line, loads the corpus, builds a
/// subword vocabulary from it and prints a short summary.
///
/// Modes, selected by the handle code returned from handle_cli():
///   0 or 1  Build with a fixed vocabulary size of 1000, print the summary and
///           the encoding of "Hello".
///   2       Build with the parsed vocab_size and print the summary only.
///   other   Do nothing (help or an error message was already printed).
///
/// @return EXIT_FAILURE when the corpus yields no lines, otherwise 0.
int main(int argc, char *argv[]) {
    CommandLineArgs ArgsCodes = handle_cli(argc, argv);
    int handle = ArgsCodes.handle;
    unsigned long long int vocab_size = ArgsCodes.vocab_size;
    std::string filename = ArgsCodes.filename;
    if (handle == 0 || handle == 1) {
        tokenizers::SubwordTextEncoder TextEncoder(1000, "Test");
        std::list<std::string> texts = load_text(filename);
        // load_text returns an empty list for a missing or empty file;
        // reading the first element of an empty list is undefined behaviour.
        if (texts.empty()) {
            std::cerr << "No text could be read from " << filename << std::endl;
            return EXIT_FAILURE;
        }
        std::cout << "Example Of Vocab: " << texts.front() << std::endl;
        TextEncoder.build_vocabulary(texts);
        std::list<std::string> vocab = TextEncoder.get_vocabulary();
        std::cout << "Top 5 Vocabulary: " << std::endl;
        std::cout << "Size of largest word " << TextEncoder.largest_word << std::endl;
        std::cout << "Size of vocabulary " << TextEncoder.get_vocab_size() << std::endl;
        // NOTE(review): largest_word is used as the table's column width;
        // presumably it is the length of the longest vocabulary entry - confirm.
        print_table(vocab, TextEncoder.largest_word);
        std::list<int> encoded_words = TextEncoder.encode("Hello");
        std::cout << "Hello encoded ";
        for (auto &word: encoded_words) {
            std::cout << word << " ";
        }
        std::cout << std::endl;
    }
    else if (handle == 2) {
        tokenizers::SubwordTextEncoder TextEncoder(vocab_size, "Test");
        std::list<std::string> texts = load_text(filename);
        // Same guard as above: never read the first line of an empty corpus.
        if (texts.empty()) {
            std::cerr << "No text could be read from " << filename << std::endl;
            return EXIT_FAILURE;
        }
        std::cout << "Example Of Vocab: " << texts.front() << std::endl;
        TextEncoder.build_vocabulary(texts);
        std::list<std::string> vocab = TextEncoder.get_vocabulary();
        std::cout << "Top 5 Vocabulary: " << std::endl;
        std::cout << "Size of largest word " << TextEncoder.largest_word << std::endl;
        std::cout << "Size of vocabulary " << TextEncoder.get_vocab_size() << std::endl;
        print_table(vocab, TextEncoder.largest_word);
    }
    return 0;
}