
Merge pull request #17 from aodn/refactor-main
Refactor main
vietnguyengit authored Oct 30, 2024
2 parents 5d83a14 + c3a4058 commit fa893af
Showing 17 changed files with 706 additions and 4,715 deletions.
19 changes: 19 additions & 0 deletions data_discovery_ai/common/keyword_classification_parameters.ini
@@ -0,0 +1,19 @@
[preprocessor]
vocabs = AODN Instrument Vocabulary, AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary
rare_label_threshold = 10
test_size = 0.2
n_splits = 5
train_test_random_state = 42

[keywordModel]
dropout = 0.3
learning_rate = 0.001
fl_gamma = 2
fl_alpha = 0.7
epoch = 100
batch = 32
early_stopping_patience = 5
reduce_lr_patience = 5
validation_split = 0.2
confidence = 0.4
top_N = 2
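
Configparser treats every value above as a plain string, so the training code reads them with typed accessors (getfloat, getint), as the keywordModel.py changes below show. A minimal loading sketch, assuming the path added in this commit (the load_params helper name is hypothetical):

from configparser import ConfigParser

def load_params(path: str = "data_discovery_ai/common/keyword_classification_parameters.ini") -> ConfigParser:
    params = ConfigParser()
    params.read(path)
    return params

params = load_params()
dropout = params.getfloat("keywordModel", "dropout")  # 0.3
epochs = params.getint("keywordModel", "epoch")  # 100
vocabs = [v.strip() for v in params.get("preprocessor", "vocabs").split(",")]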
26 changes: 0 additions & 26 deletions data_discovery_ai/common/parameters.json

This file was deleted.

214 changes: 161 additions & 53 deletions data_discovery_ai/model/keywordModel.py
@@ -1,12 +1,7 @@
"""
The keyword classification model used to identify the potential keywords for non-categorised records.
X is the embedding of the description, Y is its keywords
The keyword classification model used to identify the potential keywords for non-categorised records.
"""

import torch
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
accuracy_score,
precision_score,
@@ -31,12 +26,12 @@
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import backend as K

from tensorflow.keras.models import load_model

import logging
from matplotlib import pyplot as plt
from datetime import datetime

from typing import Dict, Callable, Any, Tuple, Optional, List
import os

os.environ["TF_USE_LEGACY_KERAS"] = "1"
@@ -45,31 +40,35 @@
logger.setLevel(logging.INFO)


def get_class_weights(Y_train):
def get_class_weights(Y_train: np.ndarray) -> Dict[int, float]:
"""
Calculate label weights from the frequency with which each label appears across all records
Input:
Y_train: numpy.ndarray. The train set of Y
Output:
label_weight_dict: Dict[int, float]. The label weights; keys are the indexes of labels and values are the weights.
"""
label_frequency = np.sum(Y_train, axis=0)
total_samples = Y_train.shape[0]
epsilon = 1e-6
label_weights = np.minimum(1, 1 / (label_frequency + epsilon))

label_weight_dict = {i: label_weights[i] for i in range(len(label_weights))}
return label_weight_dict
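# Illustration (editor's sketch, not part of this commit): for a toy matrix
#   Y_train = np.array([[1, 0], [1, 0], [1, 1]])
# label 0 appears 3 times and label 1 once, so
#   get_class_weights(Y_train)  ->  {0: ~0.33, 1: ~1.0}
# frequent labels are down-weighted, and the np.minimum(1, ...) cap keeps
# every weight at most 1.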


def baseline(X_train, Y_train, model):
if model == "KNN":
baseline_model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5))
baseline_model.fit(X_train, Y_train)
if model == "DT":
baseModel = DecisionTreeClassifier(random_state=42)
baseline_model = MultiOutputClassifier(baseModel)
baseline_model.fit(X_train, Y_train)
# TODO: add more baseline models

return baseline_model

def focal_loss(
gamma: float, alpha: float
) -> Callable[[tf.Tensor, tf.Tensor], tf.Tensor]:
"""
Creates a focal loss function with the specified gamma and alpha parameters, to address class imbalance.
Input:
gamma: float. Parameter that controls the down-weighting of easy examples. Higher gamma increases the effect.
alpha: float. Should be in the range of [0,1]. Balancing factor for positive vs negative classes.
Output:
Callable[[tf.Tensor, tf.Tensor], tf.Tensor]: A focal loss function that takes in `y_true` (true labels) and `y_pred` (predicted labels) tensors and returns the focal loss as a tensor.
"""

def focal_loss(gamma, alpha):
def focal_loss_fixed(y_true, y_pred):
def focal_loss_fixed(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
epsilon = tf.keras.backend.epsilon()
y_pred = tf.clip_by_value(y_pred, epsilon, 1.0 - epsilon)
y_true = tf.cast(y_true, tf.float32)
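# Illustration (editor's sketch, not part of this commit): per the docstring,
# the collapsed body below computes the standard binary focal loss
#   FL = -alpha * (1 - p_t)^gamma * log(p_t)
# where p_t is y_pred for positive labels and 1 - y_pred for negative ones.
# With gamma=2 and alpha=0.7 (the INI defaults), an easy positive with
# p_t = 0.9 contributes about 0.7 * 0.01 * 0.105 ≈ 0.0007, while a hard
# positive with p_t = 0.1 contributes about 0.7 * 0.81 * 2.303 ≈ 1.31,
# so easy examples are strongly down-weighted.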
@@ -82,42 +81,75 @@ def focal_loss_fixed(y_true, y_pred):


def keyword_model(
X_train, Y_train, X_test, Y_test, class_weight, dim, n_labels, params
):
model_name: str,
X_train: np.ndarray,
Y_train: np.ndarray,
X_test: np.ndarray,
Y_test: np.ndarray,
class_weight: Dict[int, float],
dim: int,
n_labels: int,
params: Dict[str, Any],
) -> Tuple[Sequential, Any, str]:
"""
Builds, trains, and evaluates a multi-label classification model for keyword prediction. Trains a neural network with configurable hyperparameters (set in `common/keyword_classification_parameters.ini`), compiles it with a focal loss function, and fits it on the provided training data.
It also saves the trained model and evaluates it on test data. The saved model can be called by the keywordClassifier API service.
Input:
model_name: str. The file name which saves the model.
X_train: np.ndarray. The training feature matrix.
Y_train: np.ndarray. The training target matrix.
X_test: np.ndarray. The test feature matrix.
Y_test: np.ndarray. The test target matrix.
class_weight: Dict[int, float]. Class weights for handling class imbalance, calculated via the get_class_weights function.
dim: int. The input feature dimension for the model.
n_labels: int. The number of output labels for the model.
params: Dict[str, Any]. The hyperparameter configuration (accessed in the code below via configparser getters such as getfloat and getint), including:
"dropout": float. Dropout rate for regularization.
"learning_rate": float. Learning rate for the Adam optimizer.
"fl_gamma: float. Gamma parameter for focal loss.
"fl_alpha": float. Alpha parameter for focal loss.
"epoch": int. Number of training epochs.
"batch": int. Batch size for training.
"early_stopping_patience": int. Patience for early stopping.
"reduce_lr_patience": int. Patience for reducing learning rate.
"validation_split": float. Fraction of data for validation during training.
Output:
model, history, model_name: Tuple[Sequential, Any, str]. The trained Keras model, the training history, and the name under which the model was saved.
"""
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
model = Sequential(
[
Input(shape=(dim,)),
Dense(128, activation="relu"),
Dropout(params["keywordModel"]["dropout"]),
Dropout(params.getfloat("keywordModel", "dropout")),
# Dense(64, activation='relu'),
# Dropout(params["keywordModel"]["dropout"]),
Dense(n_labels, activation="sigmoid"),
]
)

model.compile(
optimizer=Adam(learning_rate=params["keywordModel"]["learning_rate"]),
optimizer=Adam(learning_rate=params.getfloat("keywordModel", "learning_rate")),
loss=focal_loss(
gamma=params["keywordModel"]["fl_gamma"],
alpha=params["keywordModel"]["fl_alpha"],
gamma=params.getint("keywordModel", "fl_gamma"),
alpha=params.getfloat("keywordModel", "fl_alpha"),
),
metrics=["accuracy", "precision", "recall"],
)

model.summary()

epoch = params["keywordModel"]["epoch"]
batch_size = params["keywordModel"]["batch"]
epoch = params.getint("keywordModel", "epoch")
batch_size = params.getint("keywordModel", "batch")

early_stopping = EarlyStopping(
monitor="val_loss",
patience=params["keywordModel"]["early_stopping_patience"],
patience=params.getint("keywordModel", "early_stopping_patience"),
restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
monitor="val_loss",
patience=params["keywordModel"]["reduce_lr_patience"],
patience=params.getint("keywordModel", "reduce_lr_patience"),
min_lr=1e-6,
)

@@ -127,19 +159,26 @@ def keyword_model(
epochs=epoch,
batch_size=batch_size,
class_weight=class_weight,
validation_split=params["keywordModel"]["validation_split"],
validation_split=params.getfloat("keywordModel", "validation_split"),
callbacks=[early_stopping, reduce_lr],
)

model.save(
f"data_discovery_ai/output/{current_time}-trained-keyword-epoch{epoch}-batch{batch_size}.keras"
)
if model_name is None:
model_name = f"{current_time}-trained-keyword-epoch{epoch}-batch{batch_size}"
model.save(f"data_discovery_ai/output/{model_name}.keras")

model.evaluate(X_test, Y_test)
return model, history


def evaluation(Y_test, predictions):
return model, history, model_name


def evaluation(Y_test: np.ndarray, predictions: np.ndarray) -> Dict[str, float]:
"""
Evaluate the trained model's predicted labels against the test set. The metrics computed are accuracy, Hamming loss, precision, recall, F1 score, and Jaccard index.
Input:
Y_test: np.ndarray. The true labels from test set.
predictions: np.ndarray. The predicted labels from the trained model.
Output:
Dict[str, float]: A dictionary containing the calculated evaluation metrics, including precision, recall, F1 score, Hamming loss, Jaccard index, and accuracy.
"""
accuracy = accuracy_score(Y_test, predictions)
hammingloss = hamming_loss(Y_test, predictions)
precision = precision_score(Y_test, predictions, average="micro")
@@ -158,32 +197,101 @@ def evaluation(Y_test, predictions):
}


def prediction(X, model, confidence, top_N):
def prediction(X: np.ndarray, model: Any, confidence: float, top_N: int) -> np.ndarray:
"""
Apply the trained model to generate predictions for the input data X. A confidence threshold determines the predicted labels: any label whose probability exceeds the threshold is marked as 1. If no label exceeds the threshold for a sample, the function instead marks the top N highest-probability labels as positive (value 1).
Input:
X: np.ndarray. The input feature X matrix used for making predictions.
model: Any. The trained model (baseline model or Sequential model).
confidence: float. In the range of [0,1]. The confidence threshold for assigning labels. Predictions above this value are marked as 1.
top_N: int. The number of top predictions to select if no predictions meet the confidence threshold.
Output:
predicted_labels: np.ndarray. A binary matrix of predicted labels, where each row corresponds to a sample and each column to a label.
"""
predictions = model.predict(X)
predicted_labels = (predictions > confidence).astype(int)

for i in range(predicted_labels.shape[0]):
if predicted_labels[i].sum() == 0:
top_indices = np.argsort(predictions[i])[-top_N:]
predicted_labels[i][top_indices] = 1

return predicted_labels
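# Illustration (editor's sketch, not part of this commit): with confidence=0.4
# and top_N=2 (the INI defaults), a probability row [0.10, 0.55, 0.35, 0.45]
# clears the threshold at two positions and yields [0, 1, 0, 1]; a row
# [0.10, 0.30, 0.05, 0.20] clears it nowhere, so the top 2 probabilities
# (0.30 and 0.20) are promoted, again yielding [0, 1, 0, 1].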


def replace_with_column_names(row, column_names):
def replace_with_column_names(row: pd.Series, column_names: List[str]) -> str:
"""
Transforms a row of binary values into a string of column names (separated by " | ") for which the value in the row is 1.
Input:
row: pd.Series. A row of binary values indicating presence (1) or absence (0) of each label.
column_names: List[str]. The predefined label set.
Output:
str: The predicted keywords, separated by " | "
"""
return " | ".join([column_names[i] for i, value in enumerate(row) if value == 1])


def get_predicted_keywords(prediction, labels):
def get_predicted_keywords(prediction: np.ndarray, labels: List[str]) -> pd.Series:
"""
Convert binary predictions to textual keywords.
Input:
prediction: np.ndarray. The predicted binary matrix.
labels: List[str]. The predefined keywords.
Output:
predicted_keywords: pd.Series. The predicted keywords for the given targets.
"""
target_predicted = pd.DataFrame(prediction, columns=labels)
predicted_keywords = target_predicted.apply(
lambda row: replace_with_column_names(row, labels), axis=1
)
return predicted_keywords
# targetDS.drop(columns=["embedding", "keywords"], inplace=True)

# output = pd.concat([targetDS, predicted_keywords], axis=1)
# output.columns = ["id", "title", "description", "keywords"]
# current_time = datetime.now().strftime("%Y%m%d%H%M%S")
# output.to_csv(f"./output/saved/{current_time}.csv")
# logger.info(f"Save prediction to path output/saved/{current_time}.csv")

def baseline(
X_train: np.ndarray, Y_train: np.ndarray, model: str
) -> MultiOutputClassifier:
"""
Trains a baseline multi-output classification model based on the specified algorithm (KNN or DT).
Input:
X_train: np.ndarray. The training feature matrix.
Y_train: np.ndarray. The training target matrix.
model: str. The type of baseline model to train. Options include:
- "KNN" for K-Nearest Neighbors.
- "DT" for Decision Tree.
Output:
baseline_model: MultiOutputClassifier. The trained baseline model.
"""
if model == "KNN":
baseline_model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=5))
baseline_model.fit(X_train, Y_train)
elif model == "DT":
baseModel = DecisionTreeClassifier(random_state=42)
baseline_model = MultiOutputClassifier(baseModel)
baseline_model.fit(X_train, Y_train)
# TODO: add more baseline models
else:
raise ValueError(
f"Unsupported model type: {model}. Please choose 'KNN' or 'DT'."
)

return baseline_model


def load_saved_model(trained_model: str) -> Optional[tf.keras.Model]:
"""
Load a saved pretrained model from file by model name
Input:
trained_model: str. The name of the trained model file (without extension), located in the `data_discovery_ai/output/` directory.
Output:
Optional[tf.keras.Model]: The loaded Keras model if successful, otherwise `None`.
"""
try:
saved_model = load_model(
f"data_discovery_ai/output/{trained_model}.keras", compile=False
)
return saved_model
except Exception as e:
print(e)
logger.info(
f"Failed to load selected model {trained_model} from folder data_discovery_ai/output"
)
return None
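
Taken together, the refactored module supports a load-and-predict flow along these lines (editor's sketch: the embeddings, the label list, and the saved-model name "pretrained-keyword" are hypothetical placeholders; the functions and module path come from this file):

import numpy as np
from data_discovery_ai.model.keywordModel import (
    get_predicted_keywords,
    load_saved_model,
    prediction,
)

X = np.random.rand(5, 768)  # placeholder description embeddings (n_samples x dim)
labels = ["temperature", "salinity", "chlorophyll"]  # placeholder predefined label set

model = load_saved_model("pretrained-keyword")  # hypothetical file in data_discovery_ai/output/
if model is not None:
    predicted = prediction(X, model, confidence=0.4, top_N=2)  # INI defaults
    keywords = get_predicted_keywords(predicted, labels)
    print(keywords)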
Binary file not shown.
Binary file not shown.
