From a6fc985c82755e87f00c4882d44a602cb41561ed Mon Sep 17 00:00:00 2001 From: zane-neo Date: Tue, 15 Oct 2024 12:27:26 +0800 Subject: [PATCH] Fix PR #2976 bug due to missing adding function_name and algorithm in querying models (#3104) * Fix PR #2976 bug due to missing adding function_name and algorithm in query models Signed-off-by: zane-neo * format code Signed-off-by: zane-neo * Move the setup mehtod body to test method body Signed-off-by: zane-neo --------- Signed-off-by: zane-neo (cherry picked from commit 5b982c41b348a9ea4c6c4a561ccd97e068f538bc) --- .../autoredeploy/MLModelAutoReDeployer.java | 18 ++++- .../autoredeploy/MLModelAutoReDeployerIT.java | 65 +++++++++++++++++++ .../autoredeploy/TracedSmallModelRequest.json | 14 ++++ 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 plugin/src/test/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployerIT.java create mode 100644 plugin/src/test/resources/org/opensearch/ml/autoredeploy/TracedSmallModelRequest.json diff --git a/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java b/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java index 4441a3d592..ac38ce24c6 100644 --- a/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java +++ b/plugin/src/main/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployer.java @@ -217,7 +217,7 @@ private void triggerUndeployModelsOnDataNodes(List dataNodeIds) { client.execute(MLUndeployModelAction.INSTANCE, undeployModelNodesRequest, undeployModelListener); } } - }, e -> { log.error("Failed to query need undeploy models, no action will be performed"); }); + }, e -> { log.error("Failed to query need undeploy models, no action will be performed", e); }); queryRunningModels(listener); } @@ -241,7 +241,9 @@ private void queryRunningModels(ActionListener listener) { String[] includes = new String[] { MLModel.AUTO_REDEPLOY_RETRY_TIMES_FIELD, MLModel.PLANNING_WORKER_NODES_FIELD, - MLModel.DEPLOY_TO_ALL_NODES_FIELD }; + MLModel.DEPLOY_TO_ALL_NODES_FIELD, + MLModel.FUNCTION_NAME_FIELD, + MLModel.ALGORITHM_FIELD }; String[] excludes = new String[] { MLModel.MODEL_CONTENT_FIELD, MLModel.OLD_MODEL_CONTENT_FIELD }; FetchSourceContext fetchContext = new FetchSourceContext(true, includes, excludes); @@ -257,12 +259,24 @@ private void queryRunningModels(ActionListener listener) { @SuppressWarnings("unchecked") private void triggerModelRedeploy(ModelAutoRedeployArrangement modelAutoRedeployArrangement) { + if (modelAutoRedeployArrangement == null) { + log.info("No more models in arrangement, skipping the redeployment"); + return; + } String modelId = modelAutoRedeployArrangement.getSearchResponse().getId(); List addedNodes = modelAutoRedeployArrangement.getAddedNodes(); Map sourceAsMap = modelAutoRedeployArrangement.getSearchResponse().getSourceAsMap(); String functionName = (String) Optional .ofNullable(sourceAsMap.get(MLModel.FUNCTION_NAME_FIELD)) .orElse(sourceAsMap.get(MLModel.ALGORITHM_FIELD)); + if (functionName == null) { + log + .error( + "Model function_name or algorithm is null, model is not in correct status, please check the model, model id is: {}", + modelId + ); + return; + } if (FunctionName.REMOTE == FunctionName.from(functionName)) { log.info("Skipping redeploying remote model {} as remote model deployment can be done at prediction time.", modelId); return; diff --git a/plugin/src/test/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployerIT.java b/plugin/src/test/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployerIT.java new file mode 100644 index 0000000000..1ab8218d2e --- /dev/null +++ b/plugin/src/test/java/org/opensearch/ml/autoredeploy/MLModelAutoReDeployerIT.java @@ -0,0 +1,65 @@ +/* + * + * * Copyright OpenSearch Contributors + * * SPDX-License-Identifier: Apache-2.0 + * + */ + +package org.opensearch.ml.autoredeploy; + +import static org.opensearch.ml.common.MLTask.MODEL_ID_FIELD; + +import java.nio.file.Files; +import java.nio.file.Path; + +import org.opensearch.ml.common.MLTaskState; +import org.opensearch.ml.rest.MLCommonsRestTestCase; + +import lombok.SneakyThrows; + +public class MLModelAutoReDeployerIT extends MLCommonsRestTestCase { + + public void testModelAutoRedeploy() { + prepareModel(); + } + + @SneakyThrows + private void prepareModel() { + String requestBody = Files + .readString( + Path.of(this.getClass().getClassLoader().getResource("org/opensearch/ml/autoredeploy/TracedSmallModelRequest.json").toURI()) + ); + String registerFirstModelTaskId = registerModel(requestBody); + String registerSecondModelTaskId = registerModel(requestBody); + waitForTask(registerFirstModelTaskId, MLTaskState.COMPLETED); + getTask(client(), registerFirstModelTaskId, response -> { + String firstModelId = (String) response.get(MODEL_ID_FIELD); + try { + String deployFirstModelTaskId = deployModel(firstModelId); + getTask(client(), registerSecondModelTaskId, response1 -> { + String secondModelId = (String) response1.get(MODEL_ID_FIELD); + try { + /** + * At this time point, the model auto redeployer should be querying the deploying/deploy failed/partially deployed models. + * The original deploy model task should be able to complete successfully, if not it means the + * org.opensearch.ml.action.forward.TransportForwardAction.triggerNextModelDeployAndCheckIfRestRetryTimes might throw exception + * which cause by org.opensearch.ml.autoredeploy.MLModelAutoReDeployer#redeployAModel. The auto redeploy constructs an arrangement + * with two models, the first model deploy done event will trigger the auto redeploy's next model deploy, and if during this + * any error occurs, the first model deploy task status won't be updated to complete. So if this IT can pass, then it means the + * next model auto redeploy trigger is correct. + */ + String deploySecondModelTaskId = deployModel(secondModelId); + waitForTask(deploySecondModelTaskId, MLTaskState.COMPLETED); + } catch (Exception e) { + fail(e.getMessage()); + } + }); + waitForTask(deployFirstModelTaskId, MLTaskState.COMPLETED); + } catch (Exception e) { + logger.error(e.getMessage(), e); + fail(e.getMessage()); + } + }); + } + +} diff --git a/plugin/src/test/resources/org/opensearch/ml/autoredeploy/TracedSmallModelRequest.json b/plugin/src/test/resources/org/opensearch/ml/autoredeploy/TracedSmallModelRequest.json new file mode 100644 index 0000000000..9fc53f3b91 --- /dev/null +++ b/plugin/src/test/resources/org/opensearch/ml/autoredeploy/TracedSmallModelRequest.json @@ -0,0 +1,14 @@ +{ + "name": "traced_small_model", + "version": "1.0.0", + "model_format": "TORCH_SCRIPT", + "model_task_type": "text_embedding", + "model_content_hash_value": "e13b74006290a9d0f58c1376f9629d4ebc05a0f9385f40db837452b167ae9021", + "model_config": { + "model_type": "bert", + "embedding_dimension": 768, + "framework_type": "sentence_transformers", + "all_config": "{\"architectures\":[\"BertModel\"],\"max_position_embeddings\":512,\"model_type\":\"bert\",\"num_attention_heads\":12,\"num_hidden_layers\":6}" + }, + "url": "https://github.com/opensearch-project/ml-commons/blob/2.x/ml-algorithms/src/test/resources/org/opensearch/ml/engine/algorithms/text_embedding/traced_small_model.zip?raw=true" +}