Merge pull request #21 from aodn/update-predict-endpoint
update predict endpoint
shaunahu authored Nov 11, 2024
2 parents 740a0f9 + 815d28a commit d0a5939
Showing 27 changed files with 468 additions and 535 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -79,7 +79,9 @@ cython_debug/
input/
saved/
backup/
esManager.config
esManager.ini

.idea
node_modules/

**/output
51 changes: 51 additions & 0 deletions README.md
@@ -97,3 +97,54 @@ Every code change with commits following [Conventional Commits](https://github.c
- `feat:` For new features
- `fix:` For bug fixes
- `BREAKING CHANGE:` For any breaking changes


# Edge/Systest/Prod

Model names are strictly controlled; see the validation sketch after the table below.

Available options are `development`, `experimental`, `staging`, `production`, and `benchmark`:

| Option | Purpose | Typical Use |
| ---- | ---- | ---- |
| `development` | Dedicated to active model development, testing, and iteration. | Building and refining new model versions, features, or datasets. |
| `experimental` | Supports exploratory work for new techniques or fine-tuning. | Experimenting with new architectures, features, or hyperparameter tuning. |
| `staging` | Prepares the model for production with real-use evaluations. | Conducting final testing in a production-like environment to verify stability and performance. |
| `production` | Deployment environment for live model usage in real-world scenarios. | Running and monitoring models in active use by API. |
| `benchmark` | Baseline model used to assess improvements or changes. | Comparing performance metrics against new models. |
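As an illustration of the naming rule, here is a minimal validation sketch. `AVAILABLE_MODELS` mirrors the list in `data_discovery_ai/common/constants.py`; the `validate_model_name` helper is hypothetical and not part of this repository.

```python
# AVAILABLE_MODELS mirrors data_discovery_ai/common/constants.py (lowercase only).
AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"]


def validate_model_name(name: str) -> str:
    """Hypothetical helper: normalise a model name and reject unknown options."""
    normalized = name.strip().lower()
    if normalized not in AVAILABLE_MODELS:
        raise ValueError(f"Unknown model name {name!r}; expected one of {AVAILABLE_MODELS}")
    return normalized


validate_model_name("Production")  # returns "production"
```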

# Development

The syntax is: `/....`


# File Structure
## Required Configuration Files
1. Elasticsearch configuration file
File name `esManager.ini`, saved under the folder `data_discovery_ai/common`. Specific fields & values required (a placeholder example appears after the parameter tables below):
1. `end_point`: the Elasticsearch endpoint of a deployment
2. `api_key`: the API key used to access Elasticsearch
2. Keyword classification parameter configuration file
File name `keyword_classification_parameters.ini`, saved under the folder `data_discovery_ai/common`. Two sections are required: `preprocessor`, for the parameters used by the data preprocessing module, and `keywordModel`, for the parameters used in training and evaluating the keyword model. The fields are defined as follows:
1. `preprocessor`

| Parameter | Definition | Default Value used |
| ---- | ---- | ---- |
| vocabs | Titles of vocabularies used to identify samples from raw data; multiple values can be separated by ', '. | AODN Instrument Vocabulary, AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary |
| rare_label_threshold | The threshold for identifying a rare label, defined as the number of occurrences of the label across all sample records, should be an integer. | 10 |
| test_size | A floating-point number in the range [0, 1], indicating the percentage of the test set size relative to all samples. | 0.2 |
| n_splits | Number of re-shuffling & splitting iterations for cross-validation, used as the value of the `n_splits` parameter when initialising an instance of `MultilabelStratifiedShuffleSplit`. | 5 |
| train_test_random_state | The seed for splitting the train and test sets, used as the value of the `random_state` parameter when initialising an instance of `MultilabelStratifiedShuffleSplit`. | 42 |

2. `keywordModel`

| Parameter | Definition | Default Value used |
| ---- | ---- | ---- |
| dropout | The probability of a neuron being dropped. A strategy used for avoiding overfitting. | 0.3 |
| learning_rate | A hyperparameter that determines how much the model's parameters are adjusted with respect to the gradient of the loss function. | 0.001 |
| fl_gamma | The $\gamma$ parameter of the focal loss function, which adjusts the focus of the loss function on hard-to-classify samples. It should be an integer. | 2 |
| fl_alpha | The $\alpha$ parameter of the focal loss function, which balances the importance of positive and negative samples. It should be a floating-point number between 0 and 1. | 0.7 |
| epoch | The number of times the train set is passed through the model for training. It should be an integer. | 100 |
| batch | The batch size which defines the number of samples in each batch. | 32 |
| validation_split | The percentage of the training set to be used as the validation set. | 0.2 |
| confidence | The probability threshold for identifying a label as positive (value 1). | 0.5 |
| top_N | The number of labels to select using argmax(probability) if no labels reach the confidence threshold. | 2 |
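
As referenced above, here is a placeholder example of both configuration files. The values in `esManager.ini` are dummies, and its `[elasticsearch]` section name is an assumption (only the field names are specified above); the values in `keyword_classification_parameters.ini` mirror the defaults listed in the tables.

```ini
; data_discovery_ai/common/esManager.ini -- placeholder values, assumed section name
[elasticsearch]
end_point = https://your-deployment.es.example.com
api_key = your-elasticsearch-api-key
```

```ini
; data_discovery_ai/common/keyword_classification_parameters.ini -- defaults from the tables above
[preprocessor]
vocabs = AODN Instrument Vocabulary, AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary
rare_label_threshold = 10
test_size = 0.2
n_splits = 5
train_test_random_state = 42

[keywordModel]
dropout = 0.3
learning_rate = 0.001
fl_gamma = 2
fl_alpha = 0.7
epoch = 100
batch = 32
validation_split = 0.2
confidence = 0.5
top_N = 2
```

For reference, `fl_gamma` and `fl_alpha` correspond to $\gamma$ and $\alpha$ in the focal loss $FL(p_t) = -\alpha (1 - p_t)^{\gamma} \log(p_t)$, assuming the model uses the standard binary formulation.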
6 changes: 5 additions & 1 deletion data_discovery_ai/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
API_PREFIX = "/api/v1/ml"
API_KEY_NAME = "X-API-Key"
AVAILABLE_MODELS = ["default", "a", "b"] # just sample, use lowercase only
AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"]
KEYWORD_CONFIG = "keyword_classification_parameters.ini"
ELASTICSEARCH_CONFIG = "esManager.ini"
KEYWORD_SAMPLE_FILE = "keyword_sample.pkl"
KEYWORD_LABEL_FILE = "keyword_label.pkl"
157 changes: 0 additions & 157 deletions data_discovery_ai/model/ModelEntity.py

This file was deleted.

33 changes: 18 additions & 15 deletions data_discovery_ai/model/keywordModel.py
Original file line number Diff line number Diff line change
@@ -13,12 +13,8 @@
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
import pandas as pd
import ast
import pickle
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Input, Dropout
@@ -29,10 +25,9 @@
from tensorflow.keras.models import load_model

import logging
from matplotlib import pyplot as plt
from datetime import datetime
from typing import Dict, Callable, Any, Tuple, Optional, List
import os
from pathlib import Path

os.environ["TF_USE_LEGACY_KERAS"] = "1"

@@ -116,7 +111,6 @@ def keyword_model(
Output:
model, history: Tuple[Sequential, Any]. The trained Keras model and the training history.
"""
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
model = Sequential(
[
Input(shape=(dim,)),
@@ -162,9 +156,15 @@ def keyword_model(
validation_split=params.getfloat("keywordModel", "validation_split"),
callbacks=[early_stopping, reduce_lr],
)
if model_name is None:
model_name = f"{current_time}-trained-keyword-epoch{epoch}-batch{batch_size}"
model.save(f"data_discovery_ai/output/{model_name}.keras")
model_file_path = (
Path(__file__).resolve().parent.parent / "resources" / model_name
).with_suffix(".keras")
# ensure the destination folder exists
model_file_path.parent.mkdir(parents=True, exist_ok=True)

model.save(model_file_path)

model.evaluate(X_test, Y_test)
return model, history, model_name
@@ -218,7 +218,9 @@ def prediction(X: np.ndarray, model: Any, confidence: float, top_N: int) -> np.n
return predicted_labels


def replace_with_column_names(row: pd.SparseDtype, column_names: List[str]) -> str:
def replace_with_column_names(
row: pd.SparseDtype, column_names: List[str]
) -> List[str]:
"""
Transforms a row of binary values and returns a string of column names (separated by " | ") for which the value in the row is 1.
Input:
@@ -284,14 +286,15 @@ def load_saved_model(trained_model: str) -> Optional[load_model]:
Output:
Optional[keras_load_model]: The loaded Keras model if successful, otherwise `None`.
"""
model_file_path = (
Path(__file__).resolve().parent.parent / "resources" / trained_model
).with_suffix(".keras")
try:
saved_model = load_model(
f"data_discovery_ai/output/{trained_model}.keras", compile=False
)
saved_model = load_model(model_file_path, compile=False)
return saved_model
except Exception as e:
print(e)
logger.info(
f"Failed to load selected model {trained_model} from folder data_discovery_ai/output"
f"Failed to load selected model {trained_model} from folder data_discovery_ai/resources"
)
return None
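
For context, a minimal usage sketch of the path convention this diff introduces: trained models are now read from `data_discovery_ai/resources/<model_name>.keras` rather than `data_discovery_ai/output/`. The model name `development` is just an example.

```python
from pathlib import Path

from tensorflow.keras.models import load_model

# Mirror the path construction in keywordModel.py above:
# resources/<name>.keras inside the data_discovery_ai package.
model_file_path = (
    Path("data_discovery_ai") / "resources" / "development"
).with_suffix(".keras")
model = load_model(model_file_path, compile=False)
```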
Binary file removed data_discovery_ai/output/best-trained-keyword.keras
Binary file removed data_discovery_ai/output/keywords_sample.pkl
Binary file removed data_discovery_ai/output/keywords_target.pkl
