-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
103 lines (84 loc) · 3.35 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import re
import os
from collections import defaultdict
# Directory whose files are read and parsed by this script.
DIR_PATH = "logs/"

# Change these to your own Discord username and discriminator
# (the digits after the "#").
USER_NAME, DISCRIMINATOR = "bagel", "4824"

# Running totals that are reported once parsing has finished.
reply_counter = word_counter = 0
# Pre-compiled character class matching runs of emoji and emoji-related
# code points.
#
# The class used to contain the range U+24C2..U+1F251, which spans almost
# the entire Basic Multilingual Plane above U+24C2 and therefore also deleted
# ordinary CJK, Hangul, kana and fullwidth text. Only the individual emoji
# code points that range was meant to cover are listed now.
emoj = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F926-\U0001F937"  # gesture / person emoji
    "\U00010000-\U0010FFFF"  # every other supplementary-plane code point
    "\u2500-\u2BEF"  # box drawing .. misc symbols and arrows (incl. dingbats)
    "\u24C2"  # circled latin capital letter M
    "\u200d"  # zero-width joiner used inside emoji sequences
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward symbol
    "\u231a"  # watch
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297\u3299"  # circled ideograph congratulation / secret
    "]+",
    re.UNICODE,
)
def clean_text(text):
    """Strip Discord-specific noise from a message and return plain text.

    Removed, in order: ``@mention`` tokens, ``:name:`` emoji shortcodes,
    anything starting with ``http`` up to the next whitespace, Discord
    markdown punctuation (``|``, ``*``, ``_``, backticks, ``~~`` and ``>``)
    and every character matched by the module-level ``emoj`` pattern.

    Newlines are turned into single spaces instead of being deleted, so the
    last word of one line is not glued to the first word of the next.

    Args:
        text: Raw message content.

    Returns:
        The cleaned text with one trailing space appended.
    """
    text = re.sub(r'@\w+', '', text)  # discord mentions
    text = re.sub(r":(\w+):", "", text)  # discord emojis
    text = re.sub(r'http\S+', "", text)  # links
    # discord markdown; the former `\>+>+` alternative could never match
    # because `\>+` already consumes every run of '>'.
    text = re.sub(r'\|+|[\*_]+|\`+|\~\~+|\>+', "", text)
    text = text.replace("\n", " ")  # keep words on separate lines apart
    text = emoj.sub('', text)  # emojis
    text += " "
    return text
# Start from an empty output file. Mode "w" creates the file when it is
# missing and truncates it when it already exists, so no separate
# exclusive-create attempt (which left a handle open) is needed.
with open('output.jsonl', 'w') as outfile:
    pass

# Process every regular file found directly inside DIR_PATH.
for file in os.listdir(DIR_PATH):
    path = os.path.join(DIR_PATH, file)
    if not os.path.isfile(path):
        # Sub-directories cannot be opened as JSON; skip them.
        continue

    with open(path, 'r', encoding="utf8") as f:
        data = json.load(f)
    messages = data["messages"]

    # Map message id -> content so the target of a reply can be looked up.
    # The first content seen for an id wins.
    content_by_id = {}
    for msg in messages:
        content_by_id.setdefault(msg["id"], msg["content"])

    # Replies written by the configured user.
    user_replies = [
        msg for msg in messages
        if msg["author"]["name"] == USER_NAME
        and msg["author"]["discriminator"] == DISCRIMINATOR
        and msg["type"] == "Reply"
    ]

    # prompt text -> completion text; a repeated prompt keeps the last reply.
    prompt_completion = {}
    for reply in user_replies:
        # id of the message the user replied to
        prompt_id = reply["reference"]["messageId"]
        if prompt_id not in content_by_id:
            # The referenced message is not in this file (e.g. deleted).
            continue

        prompt_msg = clean_text(content_by_id[prompt_id])
        content_msg = clean_text(reply["content"])

        # split() without arguments ignores the trailing space added by
        # clean_text and any doubled spaces left by removed tokens, so these
        # are real word counts rather than counts of space-separated pieces.
        prompt_words = len(prompt_msg.split())
        content_words = len(content_msg.split())

        # keep the pair only if both messages are more than 4 words
        if prompt_words > 4 and content_words > 4:
            reply_counter += 1
            word_counter += prompt_words + content_words
            prompt_completion[prompt_msg.strip()] = content_msg.strip()

    # Append this file's pairs as one JSON object per line.
    with open('output.jsonl', 'a') as outfile:
        for prompt, completion in prompt_completion.items():
            json.dump({"prompt": "{}\n\n".format(prompt),
                       "completion": " {}\n".format(completion)}, outfile)
            outfile.write('\n')

print("Operation complete, find the log in output.jsonl")
print("Replies Parsed:", reply_counter)
print("Words Parsed:", word_counter)
# Rough estimate using 750 words per 1000 tokens.
print("Approximate Tokens:", round(word_counter * 1000/750))