diff --git a/.gitignore b/.gitignore
index e2077f9..a72697f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,7 +79,9 @@ cython_debug/
 input/
 saved/
 backup/
-esManager.config
+esManager.ini
 .idea
 node_modules/
+
+**/output
diff --git a/README.md b/README.md
index 7316032..932579d 100644
--- a/README.md
+++ b/README.md
@@ -97,3 +97,54 @@ Every code change with commits following [Conventional Commits](https://github.c
 - `feat:` For new features
 - `fix:` For bug fixes
 - `BREAKING CHANGE:` For any breaking changes
+
+
+# Model Environments
+
+Model names are strictly controlled.
+
+Available options: `development`, `experimental`, `staging`, `production`, `benchmark`
+| Option | Purpose | Typical Use |
+| ---- | ---- | ---- |
+| `development` | Dedicated to active model development, testing, and iteration. | Building and refining new model versions, features, or datasets. |
+| `experimental` | Supports exploratory work for new techniques or fine-tuning. | Experimenting with new architectures, features, or hyperparameter tuning. |
+| `staging` | Prepares the model for production with real-use evaluations. | Conducting final testing in a production-like environment to verify stability and performance. |
+| `production` | Deployment environment for live model usage in real-world scenarios. | Running and monitoring models in active use by the API. |
+| `benchmark` | Baseline model used to assess improvements or changes. | Comparing performance metrics against new models. |
+
+# Development
+
+Syntax is: /....
+
+
+# File Structure
+## Required Configuration Files
+1. Elasticsearch configuration file
+File name `esManager.ini`, saved under the folder `data_discovery_ai/common`. Specific fields & values required:
+    1. `end_point`: the Elasticsearch endpoint of a deployment
+    2. `api_key`: the API key used to access Elasticsearch
+2. Keyword classification parameter configuration file
+File name `keyword_classification_parameters.ini`, saved under the folder `data_discovery_ai/common`. Two sections are required: `preprocessor`, which sets the parameters used by the data preprocessing module, and `keywordModel`, which sets the parameters used for training and evaluating the keyword model. The fields are defined as follows:
+    1. `preprocessor`
+
+    | Parameter | Definition | Default Value Used |
+    | ---- | ---- | ---- |
+    | vocabs | Titles of vocabularies used to identify samples from raw data; multiple values can be separated by ', '. | AODN Instrument Vocabulary, AODN Discovery Parameter Vocabulary, AODN Platform Vocabulary |
+    | rare_label_threshold | The threshold for identifying a rare label, defined as the number of occurrences of the label across all sample records; should be an integer. | 10 |
+    | test_size | A floating-point number in the range [0, 1], indicating the percentage of the test set size relative to all samples. | 0.2 |
+    | n_splits | Number of re-shuffling & splitting iterations for cross-validation, used as the value of the `n_splits` parameter when initialising an object of `MultilabelStratifiedShuffleSplit`. | 5 |
+    | train_test_random_state | The seed for splitting the train and test sets, used as the value of the `random_state` parameter when initialising an instance of `MultilabelStratifiedShuffleSplit`. | 42 |
+
+    2. `keywordModel`
+
+    | Parameter | Definition | Default Value Used |
+    | ---- | ---- | ---- |
+    | dropout | The probability of a neuron being dropped; a strategy used to avoid overfitting.
| 0.3 | + | learning_rate | A hyperparameter determines how much the model's parameters are adjusted with respect to the gradient of the loss function. | 0.001 | + | fl_gamma | The $\gamma$ parameter of the focal loss function, which adjusts the focus of the loss function on hard-to-classify samples. It should be an integer. | 2 | + | fl_alpha | The $\alpha$ parameter of the focal loss function, which balances the importance of positive and negative samples. It should be a floating-point number between 0 and 1. | 0.7 | + | epoch | The number of times the train set is passed through the model for training. It should be an integer. | 100 | + | batch | The batch size which defines the number of samples in each batch. | 32 | + | validation_split | The percentage of the training set to be used as the validation set. | 0.2 | + | confidence | The probability threshold for identifying a label as positive (value 1). | 0.5 | + | top_N | The number of labels to select using argmax(probability) if no labels reach the confidence threshold. | 2 | diff --git a/data_discovery_ai/common/constants.py b/data_discovery_ai/common/constants.py index 3dcc63f..75696de 100644 --- a/data_discovery_ai/common/constants.py +++ b/data_discovery_ai/common/constants.py @@ -1,3 +1,7 @@ API_PREFIX = "/api/v1/ml" API_KEY_NAME = "X-API-Key" -AVAILABLE_MODELS = ["default", "a", "b"] # just sample, use lowercase only +AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"] +KEYWORD_CONFIG = "keyword_classification_parameters.ini" +ELASTICSEARCH_CONFIG = "esManager.ini" +KEYWORD_SAMPLE_FILE = "keyword_sample.pkl" +KEYWORD_LABEL_FILE = "keyword_label.pkl" diff --git a/data_discovery_ai/model/ModelEntity.py b/data_discovery_ai/model/ModelEntity.py deleted file mode 100644 index cd7357c..0000000 --- a/data_discovery_ai/model/ModelEntity.py +++ /dev/null @@ -1,157 +0,0 @@ -import numpy as np -import pandas as pd -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense, Dropout, Input -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau -from tensorflow.keras.metrics import AUC -from sklearn.metrics import ( - accuracy_score, - hamming_loss, - precision_score, - recall_score, - f1_score, - jaccard_score, -) -from datetime import datetime - - -class BaseModel: - """ - Base Model for multi-label classification tasks: keywords, parameters, organisation - """ - - def __init__(self, model=None): - self.model = model - - def compile_model(self, optimizer, loss, metrics): - if self.model: - self.model.compile(optimizer=optimizer, loss=loss, metrics=metrics) - - def fit_model( - self, - X_train, - Y_train, - epochs, - batch_size, - validation_split, - callbacks, - class_weight=None, - ): - if self.model: - history = self.model.fit( - X_train, - Y_train, - epochs=epochs, - batch_size=batch_size, - validation_split=validation_split, - callbacks=callbacks, - class_weight=class_weight, - ) - return history - - def evaluate_model(self, X_test, Y_test): - if self.model: - return self.model.evaluate(X_test, Y_test) - - def save_model(self, filepath): - if self.model: - self.model.save(filepath) - - -class KeywordModel(BaseModel): - def __init__(self, dim, n_labels): - super().__init__() - self.dim = dim - self.n_labels = n_labels - self.build_model() - - def build_model(self): - self.model = Sequential( - [ - Input(shape=(self.dim,)), - Dense(128, activation="relu"), - Dropout(0.3), - Dense(64, activation="relu"), - 
Dropout(0.3), - Dense(self.n_labels, activation="sigmoid"), - ] - ) - - def train( - self, - X_train, - Y_train, - X_test, - Y_test, - class_weight=None, - epochs=100, - batch_size=32, - ): - self.compile_model( - optimizer=Adam(learning_rate=1e-3), - loss="binary_crossentropy", - metrics=["accuracy", "precision", "recall", AUC()], - ) - - early_stopping = EarlyStopping( - monitor="val_loss", patience=5, restore_best_weights=True - ) - reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=5, min_lr=1e-6) - - history = self.fit_model( - X_train, - Y_train, - epochs=epochs, - batch_size=batch_size, - validation_split=0.1, - callbacks=[early_stopping, reduce_lr], - class_weight=class_weight, - ) - - current_time = datetime.now().strftime("%Y%m%d%H%M%S") - filepath = f"./output/saved/{current_time}-trained-keyword-epoch{epochs}-batch{batch_size}.keras" - - self.save_model(filepath) - return history - - @staticmethod - def evaluation(Y_test, predictions): - accuracy = accuracy_score(Y_test, predictions) - hammingloss = hamming_loss(Y_test, predictions) - precision = precision_score(Y_test, predictions, average="micro") - recall = recall_score(Y_test, predictions, average="micro") - f1 = f1_score(Y_test, predictions, average="micro") - jaccard = jaccard_score(Y_test, predictions, average="samples") - - return { - "accuracy": accuracy, - "hammingloss": hammingloss, - "precision": precision, - "recall": recall, - "f1": f1, - "Jaccard Index": jaccard, - } - - def predict_and_save(self, ds, confidence, labels): - X = np.array(ds["embedding"].tolist()) - predictions = self.model.predict(X) - predicted_labels = (predictions > confidence).astype(int) - - # Get label details - predicted_keywords = [] - for i in range(len(predicted_labels)): - lab = np.where(predicted_labels[i] == 1)[0] - keywords = [labels[l] for l in lab] - if len(keywords) == 0: - predicted_keywords.append(None) - else: - predicted_keywords.append(" | ".join(keywords)) - - ds["keywords"] = predicted_keywords - ds.drop(columns=["embedding"], inplace=True) - - current_time = datetime.now().strftime("%Y%m%d%H%M%S") - filepath = f"./output/saved/{current_time}.csv" - ds.to_csv(filepath) - return ds diff --git a/data_discovery_ai/model/keywordModel.py b/data_discovery_ai/model/keywordModel.py index 5c065de..81aa08c 100644 --- a/data_discovery_ai/model/keywordModel.py +++ b/data_discovery_ai/model/keywordModel.py @@ -13,12 +13,8 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.multioutput import MultiOutputClassifier -from sklearn.svm import SVC import pandas as pd -import ast -import pickle import numpy as np -from tqdm import tqdm import tensorflow as tf from tensorflow.keras.optimizers import Adam from tensorflow.keras.layers import Dense, Input, Dropout @@ -29,10 +25,9 @@ from tensorflow.keras.models import load_model import logging -from matplotlib import pyplot as plt -from datetime import datetime from typing import Dict, Callable, Any, Tuple, Optional, List import os +from pathlib import Path os.environ["TF_USE_LEGACY_KERAS"] = "1" @@ -116,7 +111,6 @@ def keyword_model( Output: model, history: Tuple[Sequential, Any]. The trained Keras model and the training history. 
""" - current_time = datetime.now().strftime("%Y%m%d%H%M%S") model = Sequential( [ Input(shape=(dim,)), @@ -162,9 +156,15 @@ def keyword_model( validation_split=params.getfloat("keywordModel", "validation_split"), callbacks=[early_stopping, reduce_lr], ) - if model_name is None: - model_name = f"{current_time}-trained-keyword-epoch{epoch}-batch{batch_size}" - model.save(f"data_discovery_ai/output/{model_name}.keras") + model_file_path = ( + Path(__file__).resolve().parent.parent / "resources" / model_name + ).with_suffix(".keras") + # make sure folder exist + model_file_path.parent.mkdir( + parents=True, exist_ok=True + ) # Ensure the folder exists + + model.save(model_file_path) model.evaluate(X_test, Y_test) return model, history, model_name @@ -218,7 +218,9 @@ def prediction(X: np.ndarray, model: Any, confidence: float, top_N: int) -> np.n return predicted_labels -def replace_with_column_names(row: pd.SparseDtype, column_names: List[str]) -> str: +def replace_with_column_names( + row: pd.SparseDtype, column_names: List[str] +) -> List[str]: """ Transform a row of binary values and returns a string of column names (separated by " | ") for which the value in the row is 1. Input: @@ -284,14 +286,15 @@ def load_saved_model(trained_model: str) -> Optional[load_model]: Output: Optional[keras_load_model]: The loaded Keras model if successful, otherwise `None`. """ + model_file_path = ( + Path(__file__).resolve().parent.parent / "resources" / trained_model + ).with_suffix(".keras") try: - saved_model = load_model( - f"data_discovery_ai/output/{trained_model}.keras", compile=False - ) + saved_model = load_model(model_file_path, compile=False) return saved_model except Exception as e: print(e) logger.info( - f"Failed to load selected model {trained_model} from folder data_discovery_ai/output" + f"Failed to load selected model {trained_model} from folder data_discovery_ai/resources" ) return None diff --git a/data_discovery_ai/output/best-trained-keyword.keras b/data_discovery_ai/output/best-trained-keyword.keras deleted file mode 100644 index 73897d2..0000000 Binary files a/data_discovery_ai/output/best-trained-keyword.keras and /dev/null differ diff --git a/data_discovery_ai/output/keywords_sample.pkl b/data_discovery_ai/output/keywords_sample.pkl deleted file mode 100644 index 624c3d8..0000000 Binary files a/data_discovery_ai/output/keywords_sample.pkl and /dev/null differ diff --git a/data_discovery_ai/output/keywords_target.pkl b/data_discovery_ai/output/keywords_target.pkl deleted file mode 100644 index 371f3e4..0000000 Binary files a/data_discovery_ai/output/keywords_target.pkl and /dev/null differ diff --git a/data_discovery_ai/output/pretrainedKeyword4demo.keras b/data_discovery_ai/output/pretrainedKeyword4demo.keras deleted file mode 100644 index 31e4f03..0000000 Binary files a/data_discovery_ai/output/pretrainedKeyword4demo.keras and /dev/null differ diff --git a/pipeline.py b/data_discovery_ai/pipeline.py similarity index 74% rename from pipeline.py rename to data_discovery_ai/pipeline.py index 12f8ee0..122bcd8 100644 --- a/pipeline.py +++ b/data_discovery_ai/pipeline.py @@ -2,6 +2,12 @@ import data_discovery_ai.model.keywordModel as model import data_discovery_ai.utils.es_connector as connector import data_discovery_ai.service.keywordClassifier as keywordClassifier +from data_discovery_ai.utils.config_utils import ConfigUtil +from data_discovery_ai.common.constants import ( + AVAILABLE_MODELS, + KEYWORD_SAMPLE_FILE, + KEYWORD_LABEL_FILE, +) import numpy as np import json import pandas 
as pd @@ -9,6 +15,8 @@ from typing import Any, Dict, Tuple from dataclasses import dataclass import logging +import tempfile +import os logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -36,15 +44,39 @@ def __init__( usePretrainedModel: bool. Choose whether to use pretrained model or train the model and then to be used. If set as True, the model_name should be given. model_name: str. The model name that saved in a .keras file. """ - params = configparser.ConfigParser() - params.read("data_discovery_ai/common/keyword_classification_parameters.ini") - self.params = params + self.config = ConfigUtil() + self.params = self.config.load_keyword_config() self.isDataChanged = isDataChanged self.usePretrainedModel = usePretrainedModel + # validate model name with accepted values, defined in data_discovery_ai/common/constants.py self.model_name = model_name - self.model = None - if self.usePretrainedModel and self.model_name is None: - raise ValueError("model name should be given to use pretrained model") + if not self.is_valid_model(): + raise ValueError( + 'Available model name: ["development", "staging", "production", "experimental", "benchmark"]' + ) + # create temp folder + self.temp_dir = tempfile.mkdtemp() + # define labels for prediction + if isDataChanged: + self.labels = None + else: + base_dir = self.config.base_dif + full_label_path = base_dir / "resources" / KEYWORD_LABEL_FILE + self.labels = preprocessor.load_from_file(full_label_path) + + """ + Validate model name within fixed selections + Input: + model_name: str. The file name of the saved model. restricted within four options: development, staging, production, and test + """ + + def is_valid_model(self) -> bool: + valid_model_name = AVAILABLE_MODELS + self.model_name = self.model_name.lower() + if self.model_name in valid_model_name: + return True + else: + return False def fetch_raw_data(self) -> pd.DataFrame: """ @@ -52,7 +84,9 @@ def fetch_raw_data(self) -> pd.DataFrame: Output: raw_data: pd.DataFrame. A DataFrame containing the raw data retrieved from Elasticsearch. 
""" - client = connector.connect_es(config_path="./esManager.config") + es_config = self.config.load_es_config() + + client = connector.connect_es(es_config) raw_data = connector.search_es(client) return raw_data @@ -74,12 +108,11 @@ def prepare_sampleSet(self, raw_data: pd.DataFrame) -> pd.DataFrame: labelledDS = preprocessor.identify_sample(raw_data, vocabs) preprocessed_samples = preprocessor.sample_preprocessor(labelledDS, vocabs) sampleSet = preprocessor.calculate_embedding(preprocessed_samples) - preprocessor.save_to_file( - sampleSet, "data_discovery_ai/input/keyword_sample.pkl" - ) - sampleSet = preprocessor.load_from_file( - "data_discovery_ai/input/keyword_sample.pkl" - ) + + full_path = os.path.join(self.temp_dir, KEYWORD_SAMPLE_FILE) + + preprocessor.save_to_file(sampleSet, full_path) + sampleSet = preprocessor.load_from_file(full_path) return sampleSet def prepare_train_test_sets(self, sampleSet: pd.DataFrame) -> TrainTestData: @@ -109,8 +142,11 @@ def prepare_train_test_sets(self, sampleSet: pd.DataFrame) -> TrainTestData: # Prepare feature matrix (X) and label matrix (Y) from the sample set X, Y, Y_df, labels = preprocessor.prepare_X_Y(sampleSet) - # Save the labels to a file for persistence - preprocessor.save_to_file(labels, "data_discovery_ai/input/labels.pkl") + self.labels = labels + + # save labels for pretrained model to use for prediction + full_path = os.path.join(self.temp_dir, KEYWORD_LABEL_FILE) + preprocessor.save_to_file(labels, full_path) # Identify rare labels based on a predefined threshold rare_label_threshold = self.params.getint( @@ -179,6 +215,7 @@ def train_evaluate_model(self, train_test_data: TrainTestData) -> None: eval = model.evaluation( Y_test=train_test_data.Y_test, predictions=predicted_labels ) + print(eval) def make_prediction(self, description: str) -> str: """ @@ -190,13 +227,23 @@ def make_prediction(self, description: str) -> str: predicted_labels: str. The predicted keywords by the trained keyword classifier model """ predicted_labels = keywordClassifier.keywordClassifier( - trained_model=self.model_name, description=description + trained_model=self.model_name, description=description, labels=self.labels ) - logger.info(predicted_labels) + print(predicted_labels) return predicted_labels -def pipeline(isDataChanged, usePretrainedModel, description, selected_model): +def pipeline( + isDataChanged: bool, usePretrainedModel: bool, description: str, selected_model: str +) -> None: + """ + The keyword classifier pipeline. + Inputs: + isDataChanged: bool. The indicator to call the data preprocessing module or not. + usePretrainedModel: bool. The indicator to use the pretrained model or not. + description: str. The item description which is used for making prediction. + selected_model: str. The model name for a selected pretrained model. 
+ """ keyword_classifier_pipeline = KeywordClassifierPipeline( isDataChanged=isDataChanged, usePretrainedModel=usePretrainedModel, @@ -209,27 +256,10 @@ def pipeline(isDataChanged, usePretrainedModel, description, selected_model): raw_data = keyword_classifier_pipeline.fetch_raw_data() sampleSet = keyword_classifier_pipeline.prepare_sampleSet(raw_data=raw_data) else: - sampleSet = preprocessor.load_from_file( - "data_discovery_ai/input/keyword_sample.pkl" - ) + base_dir = keyword_classifier_pipeline.config.base_dif + full_sampleSet_path = base_dir / "resources" / KEYWORD_SAMPLE_FILE + sampleSet = preprocessor.load_from_file(full_sampleSet_path) train_test_data = keyword_classifier_pipeline.prepare_train_test_sets(sampleSet) keyword_classifier_pipeline.train_evaluate_model(train_test_data) keyword_classifier_pipeline.make_prediction(description) - - -def test(): - item_description = """ - Ecological and taxonomic surveys of hermatypic scleractinian corals were carried out at approximately 100 sites around Lord Howe Island. Sixty-six of these sites were located on reefs in the lagoon, which extends for two-thirds of the length of the island on the western side. Each survey site consisted of a section of reef surface, which appeared to be topographically and faunistically homogeneous. The dimensions of the sites surveyed were generally of the order of 20m by 20m. Where possible, sites were arranged contiguously along a band up the reef slope and across the flat. The cover of each species was graded on a five-point scale of percentage relative cover. Other site attributes recorded were depth (minimum and maximum corrected to datum), slope (estimated), substrate type, total estimated cover of soft coral and algae (macroscopic and encrusting coralline). Coral data from the lagoon and its reef (66 sites) were used to define a small number of site groups which characterize most of this area.Throughout the survey, corals of taxonomic interest or difficulty were collected, and an extensive photographic record was made to augment survey data. A collection of the full range of form of all coral species was made during the survey and an identified reference series was deposited in the Australian Museum.In addition, less detailed descriptive data pertaining to coral communities and topography were recorded on 12 reconnaissance transects, the authors recording changes seen while being towed behind a boat. - The purpose of this study was to describe the corals of Lord Howe Island (the southernmost Indo-Pacific reef) at species and community level using methods that would allow differentiation of community types and allow comparisons with coral communities in other geographic locations. - """ - pipeline( - isDataChanged=False, - usePretrainedModel=False, - description=item_description, - selected_model="test_keyword_pipeline", - ) - - -if __name__ == "__main__": - test() diff --git a/data_discovery_ai/resources/README.md b/data_discovery_ai/resources/README.md new file mode 100644 index 0000000..c633325 --- /dev/null +++ b/data_discovery_ai/resources/README.md @@ -0,0 +1,40 @@ +# Resources +This document explain the artifacts generated/used for the keyword classification model. + +## `resouces/artifacts/*` +This folder saves artifacts used for notebooks, which includes the following files: + +- `keyword_sampke.pkl` + +## `resources/*.keras` +These files with suffix `.keras` indicate the pretrained models. 
The file names (without the suffix) are used for selecting models and are strictly restricted to these options:
+
+| Option | Purpose | Typical Use |
+| ---- | ---- | ---- |
+| `development` | Dedicated to active model development, testing, and iteration. | Building and refining new model versions, features, or datasets. |
+| `experimental` | Supports exploratory work for new techniques or fine-tuning. | Experimenting with new architectures, features, or hyperparameter tuning. |
+| `staging` | Prepares the model for production with real-use evaluations. | Conducting final testing in a production-like environment to verify stability and performance. |
+| `production` | Deployment environment for live model usage in real-world scenarios. | Running and monitoring models in active use by the API. |
+| `benchmark` | Baseline model used to assess improvements or changes. | Comparing performance metrics against new models. |
+
+The deployment of these versions should follow this workflow: `development` -> `experimental` -> `benchmark` -> `staging` -> `production`.
+
+### Acceptance Criteria
+#### Development Stage
+- [ ] Model is developed.
+- [ ] Model meets basic performance benchmarks and has stable training metrics.
+- [ ] Model is sufficiently tuned for further experimentation.
+
+#### Experimental Stage
+- [ ] Model has consistent and improved performance metrics.
+- [ ] Experiment logs are well-documented for reproducibility and understanding.
+
+#### Benchmark Stage
+- [ ] Model performance exceeds the benchmark model (a copy of the production model) on selected evaluation metrics.
+
+#### Staging Stage
+- [ ] Model is ready to go to production.
+- [ ] All integrations (e.g., API services) are validated and tested.
+
+#### Production Stage
+- [ ] Deploy the model with monitoring of metrics, user feedback, and source data changes.
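To make the model-naming rules above concrete, here is a minimal sketch (not part of the PR) of how a selected model name can be checked against `AVAILABLE_MODELS` from `data_discovery_ai/common/constants.py` and resolved to its `.keras` file under `data_discovery_ai/resources`, mirroring the path convention introduced in `keywordModel.py`. The `base_dir` value and the example call are illustrative assumptions.

```python
from pathlib import Path

# Mirrors AVAILABLE_MODELS in data_discovery_ai/common/constants.py
AVAILABLE_MODELS = ["development", "staging", "production", "experimental", "benchmark"]


def resolve_model_path(model_name: str, base_dir: Path) -> Path:
    """Validate a model name and return the expected saved-model path."""
    name = model_name.lower()
    if name not in AVAILABLE_MODELS:
        raise ValueError(f"Available model names: {AVAILABLE_MODELS}")
    # Same convention as keywordModel.py: <package>/resources/<name>.keras
    return (base_dir / "resources" / name).with_suffix(".keras")


if __name__ == "__main__":
    # Assumes the script is run from the repository root; adjust base_dir as needed.
    print(resolve_model_path("development", Path("data_discovery_ai")))
    # -> data_discovery_ai/resources/development.keras
```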
diff --git a/data_discovery_ai/resources/development.keras b/data_discovery_ai/resources/development.keras new file mode 100644 index 0000000..d901f72 Binary files /dev/null and b/data_discovery_ai/resources/development.keras differ diff --git a/data_discovery_ai/resources/keyword_label.pkl b/data_discovery_ai/resources/keyword_label.pkl new file mode 100644 index 0000000..cfc6623 Binary files /dev/null and b/data_discovery_ai/resources/keyword_label.pkl differ diff --git a/data_discovery_ai/resources/keyword_sample.pkl b/data_discovery_ai/resources/keyword_sample.pkl new file mode 100644 index 0000000..7904158 Binary files /dev/null and b/data_discovery_ai/resources/keyword_sample.pkl differ diff --git a/data_discovery_ai/output/__init__.py b/data_discovery_ai/resources/sample.pkl similarity index 100% rename from data_discovery_ai/output/__init__.py rename to data_discovery_ai/resources/sample.pkl diff --git a/data_discovery_ai/routes.py b/data_discovery_ai/routes.py index ebad416..e9a3160 100644 --- a/data_discovery_ai/routes.py +++ b/data_discovery_ai/routes.py @@ -4,8 +4,8 @@ from pydantic import BaseModel import logging from data_discovery_ai.common.constants import API_PREFIX -from data_discovery_ai.service.keywordClassifier import keywordClassifier from data_discovery_ai.utils.api_utils import api_key_auth, validate_model_name +from data_discovery_ai.pipeline import KeywordClassifierPipeline router = APIRouter(prefix=API_PREFIX) logger = logging.getLogger(__name__) @@ -22,13 +22,15 @@ async def hello(): return {"content": "Hello World!"} -@router.post("/predict-keyword", dependencies=[Depends(api_key_auth)]) -async def predict_keyword(payload: PredictKeywordRequest) -> dict[str, str]: - # TODO: just placeholder for now, the client where calling this endpoint should only know 2 things: selected - # model name, and the raw input - selected_model = validate_model_name(payload.selected_model) - raw_input = payload.raw_input - logger.info(f"selected_model: {selected_model}, raw_input: {raw_input}") - # predicted_keyword = keywordClassifier(None, None, None, None, None) - response = {"predicted_keyword": "sample_predicted_keyword"} +@router.post("/predict", dependencies=[Depends(api_key_auth)]) +async def predict_keyword(payload: PredictKeywordRequest): + # selected_model = validate_model_name(payload.selected_model) + keyword_classifier_pipeline = KeywordClassifierPipeline( + isDataChanged=False, usePretrainedModel=True, model_name=payload.selected_model + ) + logger.info( + f"selected_model: {payload.selected_model}, raw_input: {payload.raw_input}" + ) + predicted_labels = keyword_classifier_pipeline.make_prediction(payload.raw_input) + response = {"predicted_labels": predicted_labels.split(" | ")} return response diff --git a/data_discovery_ai/service/keywordClassifier.py b/data_discovery_ai/service/keywordClassifier.py index ebcd1ba..fb28ff3 100644 --- a/data_discovery_ai/service/keywordClassifier.py +++ b/data_discovery_ai/service/keywordClassifier.py @@ -1,20 +1,23 @@ import data_discovery_ai.utils.preprocessor as preprocessor import data_discovery_ai.model.keywordModel as model -import configparser +from data_discovery_ai.utils.config_utils import ConfigUtil +from data_discovery_ai.common.constants import KEYWORD_LABEL_FILE +from typing import List, Any -def keywordClassifier(trained_model, description): +def keywordClassifier( + trained_model: str, description: str, labels: List[Any] +) -> List[Any]: """ The keyword classifier service for API use. Input: - trained_model: str. 
The name of the trained model file (without extension), located in the `data_discovery_ai/output/` directory. E.g. to load from file `data_discovery_ai/output/pretrainedKeyword4demo.keras`, `traind_model=pretrainedKeyword4demo`. + trained_model: str. The name of the trained model file (without extension), located in the `data_discovery_ai/resources/` directory. E.g. to load from file `data_discovery_ai/output/pretrainedKeyword4demo.keras`, `traind_model=pretrainedKeyword4demo`. description: str. The abstract of a metadata record for predicting the keywords of the dataset. Output: - predicted_keyword: str. the predicted keywords, separate by " | ". + predicted_keyword: List of Concept objects, as json format. """ - - params = configparser.ConfigParser() - params.read("data_discovery_ai/common/keyword_classification_parameters.ini") + config = ConfigUtil() + params = config.load_keyword_config() selected_model = model.load_saved_model(trained_model) description_embedding = preprocessor.get_description_embedding(description) @@ -26,7 +29,5 @@ def keywordClassifier(trained_model, description): params.getfloat("keywordModel", "confidence"), params.getint("keywordModel", "top_N"), ) - - labels = preprocessor.load_from_file("data_discovery_ai/input/labels.pkl") prediction = model.get_predicted_keywords(target_predicted_labels, labels).to_list() - return " | ".join(prediction) + return prediction diff --git a/data_discovery_ai/utils/config_utils.py b/data_discovery_ai/utils/config_utils.py new file mode 100644 index 0000000..491831a --- /dev/null +++ b/data_discovery_ai/utils/config_utils.py @@ -0,0 +1,34 @@ +from pathlib import Path +from data_discovery_ai.common.constants import KEYWORD_CONFIG, ELASTICSEARCH_CONFIG +import configparser + + +class ConfigUtil: + def __init__(self) -> None: + self.base_dif = Path(__file__).resolve().parent.parent + + def _load_config(self, file_name: str) -> configparser.ConfigParser: + """ + The abstract method to load a configuration file. + """ + config_file_path = self.base_dif / "common" / file_name + if not config_file_path.exists(): + raise FileNotFoundError( + f"The configuration file was not found at {config_file_path}" + ) + + config = configparser.ConfigParser() + config.read(config_file_path) + return config + + def load_keyword_config(self) -> configparser.ConfigParser: + """ + The util method for load parameters for from a configuration file, which saved as data_discovery_ai/common/keyword_classification_parameters.ini + """ + return self._load_config(KEYWORD_CONFIG) + + def load_es_config(self) -> configparser.ConfigParser: + """ + The util method for load Elasticsearch configurations from a file, which saved as data_discovery_ai/common/esManager.ini + """ + return self._load_config(ELASTICSEARCH_CONFIG) diff --git a/data_discovery_ai/utils/data_preprocessor.py b/data_discovery_ai/utils/data_preprocessor.py deleted file mode 100644 index 385908a..0000000 --- a/data_discovery_ai/utils/data_preprocessor.py +++ /dev/null @@ -1,153 +0,0 @@ -import json -import pandas as pd -import csv -import re -import owslib - - -# Read search result from ES, which is copied from the console of ES and saved in a json file. 
Convert it to tsv -def json2tsv(input, output): - with open(f"./input/{input}", "r", encoding="utf-8") as f: - json_string = f.read() - data = json.loads(json_string) - indexes = data["hits"]["hits"] - df = pd.json_normalize(indexes) - - # df.columns = [c.split("_")[1] for c in list(df.columns)] - print(df.columns) - # df = df.map(lambda x: x.replace('\n', ' ').replace('\\n', ' ').replace(',', ' ') if isinstance(x, str) else x) - df.to_csv(f"./output/{output}", index=False, sep="\t") - - -# Explore dataset -def explore_dataset(ds): - print(f"Columns: {ds.columns} \n") - print(f"Shape: {ds.shape} \n") - print(ds.head) - - -def keywords_df(ds): - keywords = ds[["_id", "_source.title", "_source.description", "_source.themes"]] - keywords.columns = ["id", "title", "description", "keywords"] - # print(keywords.head) - - keywords.loc[:, "keywords"] = keywords["keywords"].apply(lambda k: eval(k)) - - # Step 1: identify target dataset: metadata has no keywords record - # filtered = keywords[keywords['keywords'].apply(lambda x: len(x) == 0)] - # filtered.loc[filtered['keywords'].apply(lambda x: len(x) == 0), 'keywords'] = None - # ds_to_tsv(filtered, "no_keywords.tsv") - - # Step 2: identify sample dataset: metadata keywords record uses AODN vocabulary - vocabs = [ - "AODN Organisation Vocabulary", - "AODN Instrument Vocabulary", - "AODN Discovery Parameter Vocabulary", - "AODN Platform Vocabulary", - "AODN Parameter Category Vocabulary", - ] - # vocabs = ['AODN Discovery Parameter Vocabulary'] - sample = keywords[ - keywords["keywords"].apply( - lambda terms: any( - any(vocab in k["title"] for vocab in vocabs) for k in terms - ) - ) - ] - - # Step 3: flattern sample table to get keywords table - result = pd.concat( - sample.apply(lambda row: flattern_keywords(row), axis=1).tolist(), - ignore_index=True, - ) - filter_result = result[result["vocabulary"].isin(vocabs)] - - # row = sample[sample['id'] == "52c92036-cea9-4b1a-b4f0-cc94b8b5df98"] - # rowdf = flattern_keywords(row) - ds_to_tsv(sample, "AODN_parameter_vocabs.tsv") - ds_to_tsv(filter_result, "AODN_parameter_vocabs_flattern.tsv") - - -def flattern_keywords(row): - id = [] - concept_id = [] - concept_url = [] - vocabolary = [] - - keywords = row["keywords"] - for k in keywords: - concept = k.get("concepts") - # print(k) - for c in concept: - id.append(row["id"]) - vocabolary.append(k.get("title")) - if c["id"] is not None: - concept_id.append(c["id"]) - try: - concept_url.append(c["url"]) - except KeyError as e: - concept_url.append(None) - - return pd.DataFrame( - { - "id": id, - "concept_id": concept_id, - "concept_url": concept_url, - "vocabulary": vocabolary, - } - ) - - -def parameter_df(ds): - ds = ds[ - [ - "_id", - "_source.title", - "_source.description", - "_source.summaries.parameter_vocabs", - ] - ] - ds.columns = ["id", "title", "description", "parameter"] - ds = ds.dropna(subset=["parameter"]) - print(ds.shape) - return ds - - -def ds_to_tsv(df, file_name): - df.to_csv(f"./output/{file_name}", index=False, sep="\t") - - -""" - Explore json fields -""" - - -def explore_jsonDS(jsonFile): - with open(f"./input/{jsonFile}", "r", encoding="utf-8") as finput: - data = json.load(finput) - hits = data["hits"]["hits"][0] - print(hits) - with open("./output/sample_test.json", "w") as foutput: - json.dump(hits, foutput, indent=4) - - -""" -Parse description phrases -""" - - -def description_phrases(): - ds = pd.read_csv("./output/AODN.tsv", sep="\t") - ds = ds[["_id", "_source.title", "_source.description"]] - ds_to_tsv(ds, 
file_name="AODN_description.tsv") - - -if __name__ == "__main__": - json2tsv("es_searchAll_result.json", "AODN.tsv") - - # ds = pd.read_csv("./output/AODN.tsv", sep='\t') - # keywords_df(ds) - # ds = pd.read_csv("./output/AODN.tsv", sep="\t") - # explore_dataset(ds) - - # description_phrases() diff --git a/data_discovery_ai/utils/es_connector.py b/data_discovery_ai/utils/es_connector.py index 44e21bc..6b70a8a 100644 --- a/data_discovery_ai/utils/es_connector.py +++ b/data_discovery_ai/utils/es_connector.py @@ -1,15 +1,13 @@ -from elasticsearch import Elasticsearch +from elasticsearch import ( + Elasticsearch, +) # TODO: please use poetry add command to install any new libraries import configparser -import json import logging import pandas as pd from tqdm import tqdm import time -from data_discovery_ai.utils.preprocessor import save_to_file -CONFIG_PATH = "./esManager.config" - logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -25,11 +23,7 @@ """ -def connect_es(config_path: str) -> Elasticsearch: - - config = configparser.ConfigParser() - config.read(config_path) - +def connect_es(config: configparser.ConfigParser) -> Elasticsearch: end_point = config["elasticsearch"]["end_point"] api_key = config["elasticsearch"]["api_key"] @@ -42,6 +36,8 @@ def connect_es(config_path: str) -> Elasticsearch: Search elasticsearch index, convert the json format to dataframe, save the dataframe to a pickle file Input: client: Elasticsearch. The initialised Elasticsearch client instance + Output: + raw_data: pd.DataFrame. The fetched raw data in a tabular format. """ @@ -75,7 +71,4 @@ def search_es(client: Elasticsearch): raw_data = pd.concat(dataframes, ignore_index=True) - save_to_file(raw_data, "./input/es-indexer-staging.pkl") - logging.info("Raw data saved to ./input/es-indexer-staging.pkl") - - # TODO: upload raw data to S3 + return raw_data diff --git a/data_discovery_ai/utils/fetech_catelogue.py b/data_discovery_ai/utils/fetech_catelogue.py deleted file mode 100644 index 929efb7..0000000 --- a/data_discovery_ai/utils/fetech_catelogue.py +++ /dev/null @@ -1,55 +0,0 @@ -from owslib.csw import CatalogueServiceWeb -import pandas as pd - - -# define item fields for use -record_format = { - "identifier": "", # record.identifier, the id of a record - "description": "", # record.identification.abstract, the description of an item - "keywords": "", # the list of keywords in json format for each thesaurus {"title1":"title","keywords1":[keywords, keywords,...]} -} - -# global varibales -current_position = 0 -max_batch_size = 10 - - -def record_process(record, record_format): - rec = record_format - rec["identifier"] = record.identifier - try: - item = record.identification[0] - rec["description"] = item.abstract - md_keywords = [] - if item.keywords: - for mk in item.keywords: - md_keywords.append( - { - "title": mk.thesaurus["title"], - "keywords": [md.name for md in mk.keywords], - } - ) - rec["keywords"] = md_keywords - else: - rec["keywords"] = None - return rec - except Exception as e: - pass - - -csw_url = "https://catalogue.aodn.org.au/geonetwork/srv/eng/csw?request=GetCapabilities&service=CSW&version=2.0.2" -csw = CatalogueServiceWeb(csw_url) -# print(csw.identification.type) -csw.getrecords2( - outputschema="http://standards.iso.org/iso/19115/-3/mdb/2.0", - esn="full", - maxrecords=10, -) - -total_records = csw.results["matches"] -print(total_records) - -# for r in csw.records: -# record = csw.records[r] -# rec = record_process(record, record_format) -# print(rec) diff --git 
a/data_discovery_ai/utils/preprocessor.py b/data_discovery_ai/utils/preprocessor.py index 31f9f4e..db2f351 100644 --- a/data_discovery_ai/utils/preprocessor.py +++ b/data_discovery_ai/utils/preprocessor.py @@ -19,37 +19,58 @@ from imblearn.over_sampling import RandomOverSampler, SMOTE from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit from tqdm import tqdm +from pathlib import Path +from typing import Dict +import tempfile + logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) -def save_to_file(obj: Any, file_name: str) -> None: +class Concept: + def __init__(self, id: str, url: str, vocab_type: str) -> None: + self.id = id + self.url = url + self.vocab_type = vocab_type + + def to_json(self) -> Dict[str, Any]: + return { + "vocab_type": self.vocab_type, + "value": self.id, + "url": self.url, + } + + def __eq__(self, other: object) -> bool: + if not isinstance(other, Concept): + return NotImplemented + + return ( + self.id == other.id + and self.url == other.url + and self.vocab_type == other.vocab_type + ) + + def __hash__(self): + return hash((self.id, self.url, self.vocab_type)) + + +def save_to_file(obj: Any, full_path: str) -> None: """ - Saves an object to a file using pickle serialization. This function saves the specified object to a file in binary format. If a specific folder path is required, include it in the `file_name`. - Input: - obj: Any. The object to be saved; no type restriction. - file_name: str. The name of the file (including path if necessary) to save the object to. - Output: - None, not return any value in this function + Saves an object to a file using pickle serialization in the specific file path """ - with open(file_name, "wb") as file: + with open(full_path, "wb") as file: pickle.dump(obj, file) - logger.info(f"Saved to {file}") + logger.info(f"Saved to {full_path}") -def load_from_file(file_name: str) -> Any: +def load_from_file(full_path: str) -> Any: """ - Loads an object from a file using pickle deserialization. This function reads a binary file and reconstructs the original object - saved in the file. It is useful for loading objects previously used `save_to_file`. - Input: - file_name: str. The name of the file (including path if necessary) to load the object from. - Output: - obj: Any. The objected that was loaded from a file. No type restriction. + Loads an object from a file in the input folder using pickle deserialization. """ - with open(file_name, "rb") as file: + with open(full_path, "rb") as file: obj = pickle.load(file) - logger.info(f"Load from {file}") + logger.info(f"Load from {full_path}") return obj @@ -67,7 +88,6 @@ def identify_sample(raw_data: pd.DataFrame, vocabs: List[str]) -> pd.DataFrame: ["_id", "_source.title", "_source.description", "_source.themes"] ] raw_data_cleaned.columns = ["id", "title", "description", "keywords"] - sampleSet = raw_data_cleaned[ raw_data_cleaned["keywords"].apply( lambda terms: any( @@ -182,7 +202,8 @@ def calculate_embedding(ds: pd.DataFrame) -> pd.DataFrame: ds: pd.DataFrame, the dataset with one more embedding column """ tqdm.pandas() - ds["embedding"] = ds["description"].progress_apply( + ds["information"] = ds["title"] + ": " + ds["description"] + ds["embedding"] = ds["information"].progress_apply( lambda x: get_description_embedding(x) ) return ds @@ -217,7 +238,7 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[ text: Union[str, List[dict]. 
The input keywords, expected to be a list of dictionaries, can be passed as a string representation of the list. vocabs: List[str]. A list of vocabulary names to match against keyword titles. Output: - A list of formatted keywords, with duplicates removed, in the form `title:id`. + A list of formatted keywords, with duplicates removed, in the form `title;id`. """ if type(text) is list: keywords = text @@ -227,9 +248,14 @@ def keywords_formatter(text: Union[str, List[dict]], vocabs: List[str]) -> List[ for keyword in keywords: for concept in keyword["concepts"]: if keyword["title"] in vocabs and concept["id"] != "": - concept_str = keyword["title"] + ":" + concept["id"] + con = Concept( + id=concept["id"].lower(), + url=concept["url"], + vocab_type=keyword["title"], + ) + concept_str = con.to_json() k_list.append(concept_str) - return list(set(k_list)) + return list(k_list) def prepare_train_test( @@ -353,25 +379,3 @@ def resampling( print(f"X resampled set size: {X_train_resampled.shape[0]}") print(f"Y resampled set size: {Y_train_resampled.shape[0]}") return X_train_resampled, Y_train_resampled - - -def load_sample(): - """ - Load sample set from a saved file. For demo use only. - """ - try: - sampleDS = load_from_file("../data_discovery_ai/input/keywords_sample.pkl") - return sampleDS - except Exception as e: - logger.info("Files not Found: Missing keywords_sample.pkl in output folder.") - - -def load_target(): - """ - Load prediction set from a saved file. For demo use only. - """ - try: - targetDS = load_from_file("../data_discovery_ai/input/keywords_target.pkl") - return targetDS - except Exception as e: - logger.info("Files not Found: Missing keywords_target.pkl in output folder.") diff --git a/notebooks/KeywordClassificationNonTechNotebook.ipynb b/notebooks/KeywordClassificationNonTechNotebook.ipynb index 1c7192a..7146e1d 100644 --- a/notebooks/KeywordClassificationNonTechNotebook.ipynb +++ b/notebooks/KeywordClassificationNonTechNotebook.ipynb @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -49,12 +49,14 @@ "if module_path not in sys.path:\n", " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\utils\")\n", " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\model\")\n", + " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\common\")\n", "\n", "current_path = os.getcwd()\n", "\n", "# import modules\n", "import preprocessor\n", - "import keywordModel" + "import keywordModel\n", + "import constants" ] }, { @@ -66,44 +68,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './data_discovery_ai/input/keyword_sample.pkl'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[6], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m labelledSet \u001b[38;5;241m=\u001b[39m \u001b[43mpreprocessor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_file\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./data_discovery_ai/input/keyword_sample.pkl\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[0;32m 3\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[1;32mc:\\Users\\yhu12\\OneDrive - University of Tasmania\\IMOS\\DataDiscovery\\data-discovery-ai\\data_discovery_ai\\utils\\preprocessor.py:29\u001b[0m, in \u001b[0;36mload_from_file\u001b[1;34m(file_name)\u001b[0m\n\u001b[0;32m 19\u001b[0m logger \u001b[38;5;241m=\u001b[39m logging\u001b[38;5;241m.\u001b[39mgetLogger(\u001b[38;5;18m__name__\u001b[39m)\n\u001b[0;32m 20\u001b[0m logger\u001b[38;5;241m.\u001b[39msetLevel(logging\u001b[38;5;241m.\u001b[39mINFO)\n\u001b[0;32m 22\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 23\u001b[0m \u001b[38;5;124;03m Saves an object to a file using pickle serialization. This function saves the specified object to a file in binary format. If a specific folder path is required, include it in the `file_name`.\u001b[39;00m\n\u001b[0;32m 24\u001b[0m \u001b[38;5;124;03m Input:\u001b[39;00m\n\u001b[0;32m 25\u001b[0m \u001b[38;5;124;03m obj: Any. The object to be saved; no type restriction.\u001b[39;00m\n\u001b[0;32m 26\u001b[0m \u001b[38;5;124;03m file_name: str. The name of the file (including path if necessary) to save the object to.\u001b[39;00m\n\u001b[0;32m 27\u001b[0m \u001b[38;5;124;03m Output:\u001b[39;00m\n\u001b[0;32m 28\u001b[0m \u001b[38;5;124;03m None, not return any value in this function\u001b[39;00m\n\u001b[1;32m---> 29\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 31\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msave_to_file\u001b[39m(\n\u001b[0;32m 32\u001b[0m obj: Any, \n\u001b[0;32m 33\u001b[0m file_name: \u001b[38;5;28mstr\u001b[39m\n\u001b[0;32m 34\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m 35\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(file_name, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m file:\n", - "\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './data_discovery_ai/input/keyword_sample.pkl'" - ] - } - ], - "source": [ - "labelledSet = preprocessor.load_from_file(\n", - " \"./data_discovery_ai/input/keyword_sample.pkl\"\n", - " )" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1631, 5)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "labelledSet.shape" ] diff --git a/notebooks/KeywordClassificationTechNotebook.ipynb b/notebooks/KeywordClassificationTechNotebook.ipynb new file mode 100644 index 0000000..9d7a8c0 --- /dev/null +++ b/notebooks/KeywordClassificationTechNotebook.ipynb @@ -0,0 +1,98 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ML model for Keyword Classification - Tech Notebook\n", + "This notebook introduces (1) how to explore, prepare and preprocess the datasets; (2) how to train and evaluate the ML model; and (3) how to use this trained ML model, for technical audiences.\n", + "## Problem Description\n", + "The AODN catalogue $C=\\{M, K, P\\}$ serves as a platform for storing datasets and their associated metadata. $M=\\{m_1,m_2,\\ldots, m_x\\}$ is a set of metadata records which are used to describe the dataset in AODN catalogue $C$. $K=\\{k_1, k_2, \\ldots, k_y\\}$ is a set of pre-defined keywords that are used to categorise dataset. 
In the catalogue $C = \\{M, K, P\\}$, a subset of metadata records, $M_t \\subseteq M$, has not yet been categorised with keywords. For these records, $K_i = \\emptyset$ for all $m_i \\in M_t$. Another subset of metadata records, $M_s \\subseteq M$, has already been categorised with keywords (i.e., $K_i \\neq \\emptyset$ for all $m_i \\in M_s$). The research question is as follows:\n", + "\n", + "How can we design and develop a machine learning model, denoted as $MM_{keywords}$, that can automatically label the uncategorised metadata records $M_t$ using a predefined set of keywords $K$? Specifically, the model should be trained to learn a mapping rule $d_i \\mapsto K_i$ based on the observed patterns from the sample set $M_s$, where each description $d_i$ of a metadata record $m_i \\in M_s$ is associated with a set of keywords $K_i$. Once trained, the model should be able to apply this learned mapping to accurately categorise the records in $M_t$ by assigning their corresponding keywords based on the records' descriptions.\n", + "\n", + "To simplify the task, we restrict the scope of keywords to those falling within the primary AODN vocabularies:\n", + "- AODN Instrument Vocabulary\n", + "- AODN Discovery Parameter Vocabulary\n", + "- AODN Platform Vocabulary\n", + "\n", + "Only keywords $k_j \\in K_i$ that are part of the listed AODN vocabularies will be considered. Any keyword not belonging to these vocabularies will be excluded from $K_i$ for all metadata records in the categorised metadata set $M_s$.\n", + "\n", + "### Formal Definitions\n", + "- **Definition 1: A metadata record $m_i=(d_i, K_i), m_i \\in M$** is a record describing a dataset. Specifically, $i$ is the unique identifier of the record, $d_i$ is a textual abstract that serves as the description of the dataset, and $K_i \\subseteq K$ is a subset of keywords used to label the dataset.\n", + "- **Definition 2: An abstract $d_i$** is a piece of textual information which is used to describe the dataset. The embedding $\\mathbf{d_i}$ is a vector representation of the textual description $d_i$, calculated using the \"bert-base-uncased\" model. The embedding vector $\\mathbf{d_i}$ for each abstract $d_i$ has a universal dimensionality, denoted as $dim=|\\mathbf{d_i}|$. A feature matrix $\\mathbf{X}$ of shape $|M_s| \\times dim$ aggregates the embeddings of the abstracts of all samples in $M_s$, where $|M_s|$ is the total number of metadata records.\n", + "- **Definition 3: A keyword $k_j$** is a predefined label used for categorising datasets. Each metadata record $m_i$ is associated with a set of keywords $K_i \\subseteq K$, where $K$ is the complete set of predefined keywords. The keywords $K_i$ for a metadata record $m_i$ are mathematically represented as a binary vector $y_i$ of size $|K|$, where each element indicates the presence or absence of a specific label. A value of 1 at position $j$ denotes that the label $k_j \\in K$ is present in the metadata record $m_i$, in this sense $k_j \\in K_i$, while a value of 0 indicates its absence. A target matrix $\\mathbf{Y}$ is a $|M_s| \\times |K|$ binary matrix, where $|M_s|$ is the size of the metadata records set $M_s=\\{m_1,m_2,\\ldots, m_x\\}$, and $|K|$ is the size of the keywords set $K=\\{k_1, k_2, \\ldots, k_y\\}$.
Each entry $ \\mathbf{K}[i, j] $ is 1 if metadata record $ m_i $ is associated with keyword $ k_j $, and 0 otherwise.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add module path for notebook to use\n", + "import sys\n", + "import os\n", + "\n", + "module_path = os.path.abspath(os.path.join('..'))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\utils\")\n", + " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\model\")\n", + " sys.path.append(module_path+\"\\\\data_discovery_ai\\\\common\")\n", + "\n", + "current_path = os.getcwd()\n", + "\n", + "# import modules\n", + "import preprocessor\n", + "import keywordModel\n", + "import constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load sample used from resources\n", + "import pickle\n", + "\n", + "sample_path = module_path+\"\\\\data_discovery_ai\\\\resources\\\\\"+constants.KEYWORD_SAMPLE_FILE\n", + "with open(sample_path, \"rb\") as pickle_file:\n", + " sampleSet = pickle.load(pickle_file)\n", + "sampleSet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X, Y, Y_df, labels = preprocessor.prepare_X_Y(sampleSet)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 8a6ccdd..a6ccd86 100644 --- a/poetry.lock +++ b/poetry.lock @@ -376,6 +376,47 @@ files = [ {file = "distlib-0.3.9.tar.gz", hash = "sha256:a60f20dea646b8a33f3e7772f74dc0b2d0772d2837ee1342a00645c81edf9403"}, ] +[[package]] +name = "elastic-transport" +version = "8.15.1" +description = "Transport classes and utilities shared among Python Elastic client libraries" +optional = false +python-versions = ">=3.8" +files = [ + {file = "elastic_transport-8.15.1-py3-none-any.whl", hash = "sha256:b5e82ff1679d8c7705a03fd85c7f6ef85d6689721762d41228dd312e34f331fc"}, + {file = "elastic_transport-8.15.1.tar.gz", hash = "sha256:9cac4ab5cf9402668cf305ae0b7d93ddc0c7b61461d6d1027850db6da9cc5742"}, +] + +[package.dependencies] +certifi = "*" +urllib3 = ">=1.26.2,<3" + +[package.extras] +develop = ["aiohttp", "furo", "httpcore (<1.0.6)", "httpx", "opentelemetry-api", "opentelemetry-sdk", "orjson", "pytest", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "pytest-mock", "requests", "respx", "sphinx (>2)", "sphinx-autodoc-typehints", "trustme"] + +[[package]] +name = "elasticsearch" +version = "8.15.1" +description = "Python client for Elasticsearch" +optional = false +python-versions = ">=3.8" +files = [ + {file = "elasticsearch-8.15.1-py3-none-any.whl", hash = "sha256:02a0476e98768a30d7926335fc0d305c04fdb928eea1354c6e6040d8c2814569"}, + {file = "elasticsearch-8.15.1.tar.gz", hash = "sha256:40c0d312f8adf8bdc81795bc16a0b546ddf544cb1f90e829a244e4780c4dbfd8"}, +] + +[package.dependencies] +elastic-transport = ">=8.13,<9" + +[package.extras] +async = ["aiohttp (>=3,<4)"] +dev = ["aiohttp", "black", "build", "coverage", "isort", "jinja2", "mapbox-vector-tile", "nox", "numpy", "orjson", "pandas", "pyarrow", 
"pytest", "pytest-asyncio", "pytest-cov", "python-dateutil", "pyyaml (>=5.4)", "requests (>=2,<3)", "simsimd", "twine", "unasync"] +docs = ["sphinx", "sphinx-autodoc-typehints", "sphinx-rtd-theme (>=2.0)"] +orjson = ["orjson (>=3)"] +pyarrow = ["pyarrow (>=1)"] +requests = ["requests (>=2.4.0,!=2.32.2,<3.0.0)"] +vectorstore-mmr = ["numpy (>=1)", "simsimd (>=3)"] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -3302,4 +3343,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "58752f45e0f6eb32491f777be51f8e17a037ba109fe1c8a7995bea6683a80478" +content-hash = "154d8cf7bdfde94032dd2cbf56fdd6ba6e6dc4a2e9dbbfb94c11d3ad0e203551" diff --git a/pyproject.toml b/pyproject.toml index 87c8473..f061516 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ torch = "2.5.0" imblearn = "^0.0" iterative-stratification = "^0.1.9" pyyaml = "^6.0.2" +elasticsearch = "^8.15.1" [tool.poetry.group.dev.dependencies] pytest = "^8.3.2" diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py new file mode 100644 index 0000000..dbdf6de --- /dev/null +++ b/tests/test_pipeline.py @@ -0,0 +1,20 @@ +# TODO: add unit tests for all functions used in pipeline. The test should have 3 stages: 1. prepare for what needs to be tested,2. executing 3. check for actual vs expected values. +from data_discovery_ai.pipeline import pipeline + + +def test(): + item_description = """ + Ecological and taxonomic surveys of hermatypic scleractinian corals were carried out at approximately 100 sites around Lord Howe Island. Sixty-six of these sites were located on reefs in the lagoon, which extends for two-thirds of the length of the island on the western side. Each survey site consisted of a section of reef surface, which appeared to be topographically and faunistically homogeneous. The dimensions of the sites surveyed were generally of the order of 20m by 20m. Where possible, sites were arranged contiguously along a band up the reef slope and across the flat. The cover of each species was graded on a five-point scale of percentage relative cover. Other site attributes recorded were depth (minimum and maximum corrected to datum), slope (estimated), substrate type, total estimated cover of soft coral and algae (macroscopic and encrusting coralline). Coral data from the lagoon and its reef (66 sites) were used to define a small number of site groups which characterize most of this area.Throughout the survey, corals of taxonomic interest or difficulty were collected, and an extensive photographic record was made to augment survey data. A collection of the full range of form of all coral species was made during the survey and an identified reference series was deposited in the Australian Museum.In addition, less detailed descriptive data pertaining to coral communities and topography were recorded on 12 reconnaissance transects, the authors recording changes seen while being towed behind a boat. + The purpose of this study was to describe the corals of Lord Howe Island (the southernmost Indo-Pacific reef) at species and community level using methods that would allow differentiation of community types and allow comparisons with coral communities in other geographic locations. + """ + + pipeline( + isDataChanged=False, + usePretrainedModel=False, + description=item_description, + selected_model="development", + ) + + +if __name__ == "__main__": + test()