-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
106 lines (98 loc) · 5.11 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gc
import numpy as np
import pandas as pd
print("Pandas imported successfully", pd.__version__)
import re
from scipy import stats
from scipy.stats import skew, kurtosis
def q1(x):
    """Return the first quartile (0.25 quantile) of ``x``.

    ``x`` must expose a ``quantile`` method; the function is listed in
    ``AGGREGATIONS`` as a custom group aggregation.
    """
    lower_quartile = x.quantile(0.25)
    return lower_quartile
def q3(x):
    """Return the third quartile (0.75 quantile) of ``x``.

    ``x`` must expose a ``quantile`` method; the function is listed in
    ``AGGREGATIONS`` as a custom group aggregation.
    """
    upper_quartile = x.quantile(0.75)
    return upper_quartile
# Function to construct essays copied from here (small adjustments): https://www.kaggle.com/code/kawaiicoderuwu/essay-contructor
def _apply_text_event(essayText, activity, cursor, change):
    """Return ``essayText`` after applying a single logged editing event.

    Parameters:
        essayText: the essay text reconstructed so far.
        activity: the event's ``activity`` value.
        cursor: the event's ``cursor_position`` value (used as a string index).
        change: the event's ``text_change`` value.
    """
    if activity == 'Replace':
        # text_change has the form "<old> => <new>"; the arithmetic places the
        # end of the new text at ``cursor``.
        replaceTxt = change.split(' => ')
        old_text, new_text = replaceTxt[0], replaceTxt[1]
        start = cursor - len(new_text)
        return essayText[:start] + new_text + essayText[start + len(old_text):]

    if activity == 'Paste':
        start = cursor - len(change)
        return essayText[:start] + change + essayText[start:]

    if activity == 'Remove/Cut':
        return essayText[:cursor] + essayText[cursor + len(change):]

    if "M" in activity:
        # Activities containing "M" are parsed as
        # "Move From [a, b] To [c, d]": drop the 10-character "Move From "
        # prefix, then read the two bracketed index pairs.
        spans = [part.split(', ') for part in activity[10:].split(' To ')]
        src_start = int(spans[0][0][1:])
        src_end = int(spans[0][1][:-1])
        dst_start = int(spans[1][0][1:])
        dst_end = int(spans[1][1][:-1])
        if src_start == dst_start:
            return essayText
        if src_start < dst_start:
            return (essayText[:src_start] + essayText[src_end:dst_end] +
                    essayText[src_start:src_end] + essayText[dst_end:])
        return (essayText[:dst_start] + essayText[src_start:src_end] +
                essayText[dst_start:src_start] + essayText[src_end:])

    # Every other activity is handled as a plain insertion whose text ends at
    # ``cursor``.
    start = cursor - len(change)
    return essayText[:start] + change + essayText[start:]


def getEssays(df):
    """Reconstruct one essay text per ``id`` from a log of editing events.

    Parameters:
        df: DataFrame with the columns ``id``, ``activity``,
            ``cursor_position`` and ``text_change``. Rows whose activity is
            ``'Nonproduction'`` are ignored.

    Returns:
        DataFrame with a single ``essay`` column, indexed by ``id`` in order of
        first appearance in ``df``.

    Events are grouped by ``id`` rather than sliced by running row counts, so
    the rows of one essay do not have to be contiguous in ``df``; within an
    essay the events are applied in their row order.
    """
    textInputDf = df[['id', 'activity', 'cursor_position', 'text_change']]
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']

    essay_ids = []
    essays = []
    # sort=False keeps the ids in order of first appearance.
    for essay_id, events in textInputDf.groupby('id', sort=False):
        essayText = ""
        for activity, cursor, change in zip(events['activity'],
                                            events['cursor_position'],
                                            events['text_change']):
            essayText = _apply_text_event(essayText, activity, cursor, change)
        essay_ids.append(essay_id)
        essays.append(essayText)

    # Collect into lists and build the frame once instead of enlarging a
    # Series item by item; dtype=object keeps an empty result string-typed.
    return pd.Series(essays, index=essay_ids, dtype=object).to_frame('essay')
# Aggregations handed to ``groupby(...).agg`` by the sentence and paragraph
# aggregation helpers. Entries are aggregation names, plus the callables
# ``q1`` and ``q3``. ``pd.DataFrame.kurt`` is currently left out.
AGGREGATIONS = [
    'count',
    'mean',
    'std',
    'min',
    'max',
    'first',
    'last',
    'sem',
    q1,
    'median',
    q3,
    'skew',
    'sum',
]
def split_essays_into_sentences(df):
    """Explode each essay into one row per non-empty sentence.

    Parameters:
        df: DataFrame with an ``essay`` column of strings; its index is copied
            into the ``id`` column of the result. ``df`` itself is left
            unmodified.

    Returns:
        DataFrame with a fresh integer index and, in addition to the columns
        of ``df``, the columns ``id``, ``sent`` (sentence text with newlines
        removed and surrounding whitespace stripped), ``sent_len`` (number of
        characters) and ``sent_word_count`` (number of pieces when splitting
        on single spaces). Sentences that end up empty are dropped.
    """
    # Work on a copy so the caller's frame does not gain 'id'/'sent' columns.
    essay_df = df.copy()
    essay_df['id'] = essay_df.index
    # A sentence ends at '.', '?' or '!'.
    essay_df['sent'] = essay_df['essay'].apply(
        lambda text: re.split(r'[.?!]', text)
    )
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(
        lambda sentence: sentence.replace('\n', '').strip()
    )
    # Number of characters in sentences
    essay_df['sent_len'] = essay_df['sent'].apply(len)
    # Number of words in sentences
    essay_df['sent_word_count'] = essay_df['sent'].apply(
        lambda sentence: len(sentence.split(' '))
    )
    return essay_df[essay_df['sent_len'] != 0].reset_index(drop=True)
def compute_sentence_aggregations(df):
    """Aggregate the sentence features of each essay into a single row.

    Parameters:
        df: DataFrame with the columns ``id``, ``sent_len`` and
            ``sent_word_count``.

    Returns:
        DataFrame with one row per ``id``: a ``<feature>_<aggregation>``
        column for every entry of ``AGGREGATIONS`` and both features, plus an
        ``id`` column. The redundant ``sent_word_count_count`` column is
        dropped and ``sent_len_count`` is renamed to ``sent_count``.
    """
    per_feature = [
        df[['id', feature]].groupby(['id']).agg(AGGREGATIONS)
        for feature in ('sent_len', 'sent_word_count')
    ]
    sent_agg_df = pd.concat(per_feature, axis=1)
    # Flatten the (feature, aggregation) column pairs into single names.
    sent_agg_df.columns = ['_'.join(pair) for pair in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df = sent_agg_df.drop(columns=["sent_word_count_count"])
    return sent_agg_df.rename(columns={"sent_len_count": "sent_count"})
def split_essays_into_paragraphs(df):
    """Explode each essay into one row per non-empty paragraph.

    Parameters:
        df: DataFrame with an ``essay`` column of strings; its index is copied
            into the ``id`` column of the result. ``df`` itself is left
            unmodified.

    Returns:
        DataFrame with a fresh integer index and, in addition to the columns
        of ``df``, the columns ``id``, ``paragraph`` (text between newlines),
        ``paragraph_len`` (number of characters) and ``paragraph_word_count``
        (number of pieces when splitting on single spaces). Empty paragraphs
        are dropped.
    """
    # Work on a copy so the caller's frame does not gain 'id'/'paragraph'
    # columns.
    essay_df = df.copy()
    essay_df['id'] = essay_df.index
    essay_df['paragraph'] = essay_df['essay'].apply(
        lambda text: text.split('\n')
    )
    essay_df = essay_df.explode('paragraph')
    # Number of characters in paragraphs
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(len)
    # Number of words in paragraphs
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(
        lambda paragraph: len(paragraph.split(' '))
    )
    return essay_df[essay_df['paragraph_len'] != 0].reset_index(drop=True)
def compute_paragraph_aggregations(df):
    """Aggregate the paragraph features of each essay into a single row.

    Parameters:
        df: DataFrame with the columns ``id``, ``paragraph_len`` and
            ``paragraph_word_count``.

    Returns:
        DataFrame with one row per ``id``: a ``<feature>_<aggregation>``
        column for every entry of ``AGGREGATIONS`` and both features, plus an
        ``id`` column. The redundant ``paragraph_word_count_count`` column is
        dropped and ``paragraph_len_count`` is renamed to ``paragraph_count``.
    """
    per_feature = [
        df[['id', feature]].groupby(['id']).agg(AGGREGATIONS)
        for feature in ('paragraph_len', 'paragraph_word_count')
    ]
    paragraph_agg_df = pd.concat(per_feature, axis=1)
    # Flatten the (feature, aggregation) column pairs into single names.
    paragraph_agg_df.columns = [
        '_'.join(pair) for pair in paragraph_agg_df.columns
    ]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df = paragraph_agg_df.drop(
        columns=["paragraph_word_count_count"]
    )
    return paragraph_agg_df.rename(
        columns={"paragraph_len_count": "paragraph_count"}
    )