
Commit

Merge branch 'main' into feat/remote-storage-workflow
muhammad-ali-e authored Nov 15, 2024
2 parents 5c61fe3 + 2003289 commit 27619f5
Showing 14 changed files with 60 additions and 69 deletions.
1 change: 1 addition & 0 deletions backend/prompt_studio/prompt_studio_core_v2/constants.py
@@ -94,6 +94,7 @@ class ToolStudioPromptKeys:
     SUMMARIZE_AS_SOURCE = "summarize_as_source"
     VARIABLE_MAP = "variable_map"
     RECORD = "record"
+    ENABLE_HIGHLIGHT = "enable_highlight"
 
 
 class FileViewTypes:
@@ -810,6 +810,7 @@ def _fetch_response(
         tool_settings[TSPKeys.PREAMBLE] = tool.preamble
         tool_settings[TSPKeys.POSTAMBLE] = tool.postamble
         tool_settings[TSPKeys.GRAMMAR] = grammar_list
+        tool_settings[TSPKeys.ENABLE_HIGHLIGHT] = tool.enable_highlight
         tool_settings[TSPKeys.PLATFORM_POSTAMBLE] = getattr(
             settings, TSPKeys.PLATFORM_POSTAMBLE.upper(), ""
         )
@@ -1068,6 +1069,7 @@ def _fetch_single_pass_response(
         tool_settings[TSPKeys.CHUNK_SIZE] = default_profile.chunk_size
         tool_settings[TSPKeys.CHUNK_OVERLAP] = default_profile.chunk_overlap
         tool_settings[TSPKeys.ENABLE_CHALLENGE] = tool.enable_challenge
+        tool_settings[TSPKeys.ENABLE_HIGHLIGHT] = tool.enable_highlight
         tool_settings[TSPKeys.CHALLENGE_LLM] = challenge_llm
 
         for prompt in prompts:
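
The two hunks above mirror the existing enable_challenge plumbing: the model field is copied into the tool_settings dict that is shipped to the prompt service. A minimal sketch of that pattern (the dataclass is a simplified stand-in, not the actual Django model):

from dataclasses import dataclass


@dataclass
class ToolStub:
    # Stand-in for the CustomTool model fields used here.
    enable_challenge: bool = False
    enable_highlight: bool = False


ENABLE_HIGHLIGHT = "enable_highlight"  # mirrors ToolStudioPromptKeys

tool = ToolStub(enable_highlight=True)
tool_settings: dict[str, bool] = {}
tool_settings[ENABLE_HIGHLIGHT] = tool.enable_highlight
assert tool_settings == {"enable_highlight": True}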
@@ -152,7 +152,7 @@ def update_or_create_prompt_output(
         output=output,
         eval_metrics=eval_metrics,
         tool=tool,
-        context=context,
+        context=json.dumps(context),
         challenge_data=challenge_data,
     )

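
Since the context is now a collection of chunks rather than a single string, it is JSON-encoded before being written to the output record; the matching json.loads lives in the serializer below. A quick sketch of the round trip this sets up (the variable names here are illustrative):

import json

context = ["chunk one", "chunk two"]
stored = json.dumps(context)           # what update_or_create_prompt_output now persists
assert json.loads(stored) == context   # what the serializer recovers on read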
@@ -1,3 +1,4 @@
+import json
 import logging
 
 from usage.helper import UsageHelper
@@ -25,4 +26,10 @@ def to_representation(self, instance):
         )
         token_usage = {}
         data["token_usage"] = token_usage
+        # Convert string to list
+        try:
+            data["context"] = json.loads(data["context"])
+        except json.JSONDecodeError:
+            # Convert the old value of data["context"] to a list
+            data["context"] = [data["context"]]
         return data
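
The try/except added to to_representation keeps old rows readable: records written before this change hold a plain string in context, while new ones hold a JSON-encoded list. The same fallback as a standalone sketch (function name assumed):

import json


def parse_context(raw: str) -> list:
    try:
        return json.loads(raw)  # new rows: JSON-encoded list
    except json.JSONDecodeError:
        return [raw]  # old rows: wrap the plain string in a list


assert parse_context('["a", "b"]') == ["a", "b"]
assert parse_context("legacy context text") == ["legacy context text"]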
@@ -154,7 +154,7 @@ def update_or_create_prompt_output(
         output=output,
         eval_metrics=eval_metrics,
         tool=tool,
-        context=context,
+        context=json.dumps(context),
         challenge_data=challenge_data,
     )

@@ -1,3 +1,4 @@
+import json
 import logging
 
 from usage_v2.helper import UsageHelper
@@ -25,4 +26,10 @@ def to_representation(self, instance):
         )
         token_usage = {}
         data["token_usage"] = token_usage
+        # Convert string to list
+        try:
+            data["context"] = json.loads(data["context"])
+        except json.JSONDecodeError:
+            # Convert the old value of data["context"] to a list
+            data["context"] = [data["context"]]
         return data
@@ -73,6 +73,12 @@ def frame_spec(tool: CustomTool) -> Spec:
"default": False,
"description": "Enables SinglePass Extraction",
},
"enable_highlight": {
"type": "boolean",
"title": "Enable highlight",
"default": False,
"description": "Enables highlight",
},
}

spec = Spec(
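
The new enable_highlight property follows the same JSON-Schema-style shape as the enable_single_pass_extraction entry above it, so any schema-aware consumer should accept it unchanged. An illustrative check with the jsonschema package (an assumption for demonstration; this commit does not use it):

from jsonschema import validate  # illustrative dependency, not part of the commit

properties = {
    "enable_highlight": {
        "type": "boolean",
        "title": "Enable highlight",
        "default": False,
        "description": "Enables highlight",
    },
}
schema = {"type": "object", "properties": properties}

validate(instance={"enable_highlight": True}, schema=schema)  # passes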
@@ -16,19 +16,7 @@ function OutputForIndex({ chunkData, setIsIndexOpen, isIndexOpen }) {
   const activeRef = useRef(null);
 
   useEffect(() => {
-    if (!chunkData) {
-      setChunks([]);
-    }
-    // Split chunkData into chunks using \f\n delimiter
-    const tempChunks = chunkData?.split("\f\n");
-    // To remove " at the end
-    if (tempChunks?.length > 0) {
-      const lastChunk = tempChunks[tempChunks?.length - 1].trim();
-      if (lastChunk === '\\n"' || lastChunk === "") {
-        tempChunks.pop();
-      }
-    }
-    setChunks(tempChunks);
+    setChunks(chunkData || []);
   }, [chunkData]);
 
   // Debounced search handler
14 changes: 0 additions & 14 deletions frontend/src/hooks/useSessionValid.js
@@ -19,15 +19,6 @@ try {
   // Plugin not available
 }
 
-// Import useGoogleTagManager hook
-let hsSignupEvent;
-try {
-  hsSignupEvent =
-    require("../plugins/hooks/useGoogleTagManager.js").useGoogleTagManager();
-} catch {
-  // Ignore if hook not available
-}
-
 let selectedProduct;
 let selectedProductStore;
 let PRODUCT_NAMES = {};
@@ -120,11 +111,6 @@ function useSessionValid() {
         }
       });
 
-      const isNewOrg = setOrgRes?.data?.is_new_org || false;
-      if (isNewOrg && hsSignupEvent) {
-        hsSignupEvent();
-      }
-
       userAndOrgDetails = setOrgRes?.data?.user;
       userAndOrgDetails["orgName"] = setOrgRes?.data?.organization?.name;
       userAndOrgDetails["orgId"] = orgId;
18 changes: 9 additions & 9 deletions frontend/src/routes/Router.jsx
@@ -113,20 +113,20 @@ function Router() {

       {/* protected routes */}
       <Route path="setOrg" element={<SetOrgPage />} />
-      {SelectProduct && (
-        <Route path="selectProduct" element={<SelectProduct />} />
-      )}
-      {TrialRoutes && (
-        <Route path="/trial-expired" element={<TrialRoutes />} />
-      )}
-      {PaymentSuccessful && (
-        <Route path="/payment/success" element={<PaymentSuccessful />} />
-      )}
       <Route path="" element={<RequireAuth />}>
         <Route path="">{MainAppRoute}</Route>
         {llmWhispererRouter && (
           <Route path="llm-whisperer">{llmWhispererRouter()}</Route>
         )}
+        {TrialRoutes && (
+          <Route path="/trial-expired" element={<TrialRoutes />} />
+        )}
+        {SelectProduct && (
+          <Route path="selectProduct" element={<SelectProduct />} />
+        )}
+        {PaymentSuccessful && (
+          <Route path="/payment/success" element={<PaymentSuccessful />} />
+        )}
       </Route>
     </Route>

4 changes: 2 additions & 2 deletions prompt-service/src/unstract/prompt_service/helper.py
@@ -83,11 +83,11 @@ def plugin_loader(app: Flask) -> None:
     initialize_plugin_endpoints(app=app)
 
 
-def get_cleaned_context(context: str) -> str:
+def get_cleaned_context(context: set[str]) -> list[str]:
     clean_context_plugin: dict[str, Any] = plugins.get(PSKeys.CLEAN_CONTEXT, {})
     if clean_context_plugin:
         return clean_context_plugin["entrypoint_cls"].run(context=context)
-    return context
+    return list(context)
 
 
 def initialize_plugin_endpoints(app: Flask) -> None:
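
Callers of get_cleaned_context now pass the accumulated set of chunks and always get a list back; when no clean-context plugin is registered, the set is simply materialized. A self-contained sketch with the plugin registry stubbed empty (the "clean-context" key name is a stand-in for PSKeys.CLEAN_CONTEXT):

from typing import Any

plugins: dict[str, Any] = {}  # stub registry: no clean-context plugin installed


def get_cleaned_context(context: set[str]) -> list[str]:
    clean_context_plugin: dict[str, Any] = plugins.get("clean-context", {})
    if clean_context_plugin:
        return clean_context_plugin["entrypoint_cls"].run(context=context)
    return list(context)


print(get_cleaned_context({"chunk a", "chunk b"}))  # a list; set order is unspecified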
43 changes: 15 additions & 28 deletions prompt-service/src/unstract/prompt_service/main.py
@@ -251,10 +251,10 @@ def prompt_processor() -> Any:
             raise api_error
 
         try:
-            context = ""
+            context: set[str] = set()
             if output[PSKeys.CHUNK_SIZE] == 0:
                 # We can do this only for chunkless indexes
-                context: Optional[str] = index.query_index(
+                retrieved_context: Optional[str] = index.query_index(
                     embedding_instance_id=output[PSKeys.EMBEDDING],
                     vector_db_instance_id=output[PSKeys.VECTOR_DB],
                     doc_id=doc_id,
@@ -270,13 +270,13 @@
                     # inconsistent, and not reproducible easily,
                     # this is just a safety net.
                     time.sleep(2)
-                    context: Optional[str] = index.query_index(
+                    retrieved_context: Optional[str] = index.query_index(
                         embedding_instance_id=output[PSKeys.EMBEDDING],
                         vector_db_instance_id=output[PSKeys.VECTOR_DB],
                         doc_id=doc_id,
                         usage_kwargs=usage_kwargs,
                     )
-                if context is None:
+                if retrieved_context is None:
                     # TODO: Obtain user set name for vector DB
                     msg = NO_CONTEXT_ERROR
                     app.logger.error(
@@ -294,6 +294,7 @@
                         msg,
                     )
                     raise APIError(message=msg)
+                context.add(retrieved_context)
                 # TODO: Use vectorDB name when available
                 publish_log(
                     log_events_id,
@@ -323,7 +324,7 @@ def prompt_processor() -> Any:
                     tool_settings=tool_settings,
                     output=output,
                     llm=llm,
-                    context=context,
+                    context="\n".join(context),
                     prompt="promptx",
                     metadata=metadata,
                 )
@@ -537,7 +538,7 @@ def prompt_processor() -> Any:
                         llm=llm,
                         challenge_llm=challenge_llm,
                         run_id=run_id,
-                        context=context,
+                        context="\n".join(context),
                         tool_settings=tool_settings,
                         output=output,
                         structured_output=structured_output,
@@ -593,7 +594,7 @@ def prompt_processor() -> Any:
                 try:
                     evaluator = eval_plugin["entrypoint_cls"](
                         "",
-                        context,
+                        "\n".join(context),
                         "",
                         "",
                         output,
@@ -680,7 +681,7 @@ def run_retrieval(  # type:ignore
     retrieval_type: str,
     metadata: dict[str, Any],
 ) -> tuple[str, str]:
-    context: str = ""
+    context: set[str] = set()
     prompt = output[PSKeys.PROMPTX]
     if retrieval_type == PSKeys.SUBQUESTION:
         subq_prompt: str = (
@@ -713,19 +714,11 @@
             prompt=subq_prompt,
         )
         subquestion_list = subquestions.split(",")
-        raw_retrieved_context = ""
         for each_subq in subquestion_list:
             retrieved_context = _retrieve_context(
                 output, doc_id, vector_index, each_subq
             )
-            # Not adding the potential for pinecode serverless
-            # inconsistency issue owing to risk of infinte loop
-            # and inablity to diffrentiate genuine cases of
-            # empty context.
-            raw_retrieved_context = "\f\n".join(
-                [raw_retrieved_context, retrieved_context]
-            )
-            context = _remove_duplicate_nodes(raw_retrieved_context)
+            context.update(retrieved_context)
 
     if retrieval_type == PSKeys.SIMPLE:

@@ -746,21 +739,15 @@ def run_retrieval(  # type:ignore
             tool_settings=tool_settings,
             output=output,
             llm=llm,
-            context=context,
+            context="\n".join(context),
             prompt="promptx",
             metadata=metadata,
         )
 
     return (answer, context)
 
 
-def _remove_duplicate_nodes(retrieved_context: str) -> str:
-    context_set: set[str] = set(retrieved_context.split("\f\n"))
-    fomatted_context = "\f\n".join(context_set)
-    return fomatted_context
-
-
-def _retrieve_context(output, doc_id, vector_index, answer) -> str:
+def _retrieve_context(output, doc_id, vector_index, answer) -> set[str]:
     retriever = vector_index.as_retriever(
         similarity_top_k=output[PSKeys.SIMILARITY_TOP_K],
         filters=MetadataFilters(
@@ -773,18 +760,18 @@ def _retrieve_context(output, doc_id, vector_index, answer) -> str:
         ),
     )
     nodes = retriever.retrieve(answer)
-    text = ""
+    context: set[str] = set()
     for node in nodes:
         # ToDo: May have to fine-tune this value for node score or keep it
         # configurable at the adapter level
         if node.score > 0:
-            text += node.get_content() + "\f\n"
+            context.add(node.get_content())
         else:
             app.logger.info(
                 "Node score is less than 0. "
                 f"Ignored: {node.node_id} with score {node.score}"
             )
-    return text
+    return context
 
 
 def log_exceptions(e: HTTPException):
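
Taken together, these hunks replace string concatenation on "\f\n" plus _remove_duplicate_nodes with a set that deduplicates chunks as they are gathered, joined with newlines only where a single prompt string is needed. A condensed sketch of the new flow, with the retriever stubbed out (fake_retrieve and its scores are invented for illustration):

def fake_retrieve(subquestion: str) -> list[dict]:
    # Stub standing in for vector_index.as_retriever(...).retrieve(...).
    return [
        {"text": "shared chunk", "score": 0.9},
        {"text": "shared chunk", "score": 0.8},  # duplicate; the set drops it
        {"text": f"chunk for {subquestion}", "score": 0.7},
        {"text": "low-score chunk", "score": 0.0},  # filtered, as in _retrieve_context
    ]


def retrieve_context(subquestion: str) -> set[str]:
    return {n["text"] for n in fake_retrieve(subquestion) if n["score"] > 0}


context: set[str] = set()
for subq in ["q1", "q2"]:
    context.update(retrieve_context(subq))

prompt_context = "\n".join(context)  # single string handed to the LLM call
print(prompt_context)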
1 change: 1 addition & 0 deletions tools/structure/src/constants.py
@@ -72,4 +72,5 @@ class SettingsKeys:
     METADATA = "metadata"
     EPILOGUE = "epilogue"
     HIGHLIGHT_DATA = "highlight_data"
+    CONFIDENCE_DATA = "confidence_data"
     EXECUTION_RUN_DATA_FOLDER = "EXECUTION_RUN_DATA_FOLDER"
8 changes: 7 additions & 1 deletion tools/structure/src/main.py
@@ -216,10 +216,16 @@ def run(
         epilogue = metadata.pop(SettingsKeys.EPILOGUE, None)
         if epilogue:
             try:
-                from helper import transform_dict  # type: ignore [attr-defined]
+                from helper import (  # type: ignore [attr-defined]
+                    get_confidence_data,
+                    transform_dict,
+                )
 
                 highlight_data = transform_dict(epilogue, tool_data_dir)
                 metadata[SettingsKeys.HIGHLIGHT_DATA] = highlight_data
+                metadata[SettingsKeys.CONFIDENCE_DATA] = get_confidence_data(
+                    epilogue, tool_data_dir
+                )
             except ImportError:
                 self.stream_log(
                     f"Highlight metadata is not added. {PAID_FEATURE_MSG}",
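
Keeping the helper import inside the try block means the paid-feature module stays optional: when it is absent, the ImportError path logs and the run continues without highlight or confidence metadata. The guard pattern in isolation (the module, arguments, and message are stand-ins; this sketch exercises the fallback path when no helper module is importable):

metadata: dict = {}
try:
    # Optional plugin; may not ship with every build of the tool.
    from helper import get_confidence_data, transform_dict  # type: ignore
except ImportError:
    print("Highlight metadata is not added; paid helper unavailable.")
else:
    metadata["highlight_data"] = transform_dict({}, ".")
    metadata["confidence_data"] = get_confidence_data({}, ".")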

