-
Notifications
You must be signed in to change notification settings - Fork 0
/
header_rf.py
68 lines (48 loc) · 1.79 KB
/
header_rf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
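'''
Checks that headings_df goes through the correct preprocessing steps, then
fits, saves and evaluates a model (utilities.rf.RFAnalysis) that predicts
whether a heading is labelled 'Job' or 'Person' from its heading text.
'''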
#%%
import pandas as pd
import plotly.express as px
import numpy as np
from utilities.utils import preprocess_heading_text
from utilities.utils import validate_data_types
# Load the labelled headings table; the first CSV column becomes the index.
df = pd.read_csv("data/headings/headings_df.csv", index_col=0)
# Hand the three text/label columns to the project validator as string columns.
validate_data_types(
    df,
    str_columns=['Heading Text', 'Heading Title', 'Person/Job/Org/None'],
)
def separate_into_groups(df, label_col='Person/Job/Org/None'):
    '''
    Split the headings dataframe into its Job, Person and Org rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the label column.
    label_col : str, optional
        Name of the column holding the 'Job' / 'Person' / 'Org' labels.
        Defaults to 'Person/Job/Org/None'.

    Returns
    -------
    tuple of pandas.DataFrame
        (job, person, org): the rows of df whose label equals 'Job',
        'Person' and 'Org' respectively. Rows carrying any other label
        (or a missing one) appear in none of the three frames.
    '''
    # Look the label column up once instead of once per group.
    labels = df[label_col]
    job, person, org = (df[labels == group] for group in ('Job', 'Person', 'Org'))
    return job, person, org
# Run every heading's text through preprocess_heading_text.
df['Heading Text'] = df['Heading Text'].apply(preprocess_heading_text)
# Keep only the rows labelled 'Job' or 'Person'; every other label is dropped.
df = df[df['Person/Job/Org/None'].isin(['Job', 'Person'])]
#%%
from utilities.rf import RFAnalysis
from utilities.utils import to_wcdf
# Get features
# NOTE(review): presumably a word-count dataframe built from the heading text;
# confirm against utilities.utils.to_wcdf. Its columns are used as feature names below.
X = to_wcdf(df['Heading Text'])
# Get target: the label column, which at this point holds only 'Job' / 'Person' rows
y = df['Person/Job/Org/None']
#%%
rf = RFAnalysis()
# Split the data
X_train, X_test, y_train, y_test = rf.train_test_split(X, y)
# Train the model
print("Training model...")
fitted_grid = rf.rf_pipeline(X_train, y_train)
# Save the model
# NOTE(review): absolute, machine-specific output path, whereas the input CSV is
# read through a repo-relative path; consider making this relative as well.
print("Saving model...")
rf.save_model(fitted_grid, '/Users/mtaruno/Documents/DevZone/job-research/data/models')
#%%
# Evaluate the model
print("Evaluating model...")
rf.evaluate_model(X_train, y_train, X_test, y_test, fitted_grid)
#%%
# Get the feature importances
# NOTE(review): the trailing '{}' in the save path is presumably filled in by
# get_feature_importances itself; confirm in utilities.rf.
print("Getting feature importances...")
importances = rf.get_feature_importances(
    fitted_grid,
    feature_names=X.columns.tolist(),
    save_directory_path='/Users/mtaruno/Documents/DevZone/job-research/data/artifacts/importances/{}',
)
# %%
rf.visualize_feature_importances(importances)
# %%
# %%