Merge branch 'main' into misc/UN-3396-MISC_design_rules_prototype

muhammad-ali-e · web-flow · commit 2459e8705113 · 2026-04-08T17:09:39.000+05:30
diff --git a/backend/prompt_studio/prompt_studio_core_v2/internal_urls.py b/backend/prompt_studio/prompt_studio_core_v2/internal_urls.py
@@ -10,6 +10,11 @@
     path("output/", internal_views.prompt_output, name="prompt-output"),
     path("index/", internal_views.index_update, name="index-update"),
     path("indexing-status/", internal_views.indexing_status, name="indexing-status"),
+    path(
+        "extraction-status/",
+        internal_views.extraction_status,
+        name="extraction-status",
+    ),
     path(
         "profile/<str:profile_id>/",
         internal_views.profile_detail,
diff --git a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py
@@ -154,6 +154,73 @@ def index_update(request):
         )
 
 
+@csrf_exempt
+@require_http_methods(["POST"])
+def extraction_status(request):
+    """Mark IndexManager.extraction_status for a document+profile pair.
+
+    Called by the ide_callback worker after a successful ide_index run so
+    that subsequent Answer Prompt dispatches can short-circuit extraction
+    via PromptStudioIndexHelper.check_extraction_status.
+
+    Expected JSON payload:
+    {
+        "document_id": str,
+        "profile_manager_id": str,
+        "x2text_config_hash": str,
+        "enable_highlight": bool (optional, default false),
+        "extracted": bool (optional, default true),
+        "error_message": str | null (optional)
+    }
+    """
+    data, err = _parse_json_body(request)
+    if err:
+        return err
+
+    document_id = data.get("document_id", "")
+    profile_manager_id = data.get("profile_manager_id", "")
+    x2text_config_hash = data.get("x2text_config_hash", "")
+    enable_highlight = data.get("enable_highlight", False)
+    extracted = data.get("extracted", True)
+    error_message = data.get("error_message")
+
+    if not document_id or not profile_manager_id or not x2text_config_hash:
+        return JsonResponse(
+            {
+                "success": False,
+                "error": (
+                    "document_id, profile_manager_id, and x2text_config_hash "
+                    "are required"
+                ),
+            },
+            status=status.HTTP_400_BAD_REQUEST,
+        )
+
+    try:
+        from prompt_studio.prompt_profile_manager_v2.models import ProfileManager
+        from prompt_studio.prompt_studio_index_manager_v2.prompt_studio_index_helper import (
+            PromptStudioIndexHelper,
+        )
+
+        profile_manager = ProfileManager.objects.get(pk=profile_manager_id)
+        success = PromptStudioIndexHelper.mark_extraction_status(
+            document_id=document_id,
+            profile_manager=profile_manager,
+            x2text_config_hash=x2text_config_hash,
+            enable_highlight=enable_highlight,
+            extracted=extracted,
+            error_message=error_message,
+        )
+        return JsonResponse({"success": success})
+
+    except Exception as e:
+        logger.exception("extraction_status internal API failed")
+        return JsonResponse(
+            {"success": False, "error": str(e)},
+            status=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        )
+
+
 @csrf_exempt
 @require_http_methods(["POST"])
 def indexing_status(request):
diff --git a/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py b/backend/prompt_studio/prompt_studio_index_manager_v2/prompt_studio_index_helper.py
@@ -97,11 +97,6 @@ def mark_extraction_status(
             with transaction.atomic():
                 document = DocumentManager.objects.get(pk=document_id)
 
-                args = {
-                    "document_manager": document,
-                    "profile_manager": profile_manager,
-                }
-
                 # Build extraction status data
                 status_data = {
                     "extracted": extracted,
@@ -112,13 +107,23 @@ def mark_extraction_status(
                 if not extracted and error_message:
                     status_data["error"] = error_message
 
-                defaults = {"extraction_status": {x2text_config_hash: status_data}}
-
-                index_manager, created = IndexManager.objects.update_or_create(
-                    **args,
-                    defaults=defaults,
+                # Lock the row (or create an empty one) so concurrent callers
+                # merge into the same dict rather than clobbering each other.
+                index_manager, created = (
+                    IndexManager.objects.select_for_update().get_or_create(
+                        document_manager=document,
+                        profile_manager=profile_manager,
+                        defaults={"extraction_status": {}},
+                    )
                 )
 
+                # Merge into a copy of the stored dict; update_or_create(defaults=...)
+                # would replace the whole dict and wipe any prior hash entries.
+                extraction_status = dict(index_manager.extraction_status or {})
+                extraction_status[x2text_config_hash] = status_data
+                index_manager.extraction_status = extraction_status
+                index_manager.save(update_fields=["extraction_status"])
+
                 logger.info(
                     f"Index manager {index_manager} {index_manager.index_ids_history}"
                 )
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
@@ -1026,9 +1026,20 @@ def _handle_index(self, context: ExecutionContext) -> ExecutionResult:
                 doc_id_found,
                 reindex,
             )
+            if doc_id_found and not reindex:
+                shim.stream_log(
+                    "Document already indexed in vector store; skipping re-index."
+                )
+                logger.info(
+                    "Skipping re-index: doc_id=%s already in vector DB and "
+                    "reindex=False",
+                    doc_id,
+                )
+                return ExecutionResult(success=True, data={IKeys.DOC_ID: doc_id})
+
             if doc_id_found and reindex:
                 shim.stream_log("Document already indexed, re-indexing...")
-            elif not doc_id_found:
+            else:
                 shim.stream_log("Indexing document for the first time...")
             shim.stream_log("Indexing document into vector store...")
             index.perform_indexing(
diff --git a/workers/ide_callback/tasks.py b/workers/ide_callback/tasks.py
@@ -211,6 +211,31 @@ def ide_index_complete(
                     profile_manager_id,
                 )
 
+        # Mark extraction_status so subsequent Answer Prompt dispatches
+        # can short-circuit re-extraction. The Phase 4 backend payload
+        # already stashes x2text_config_hash and enable_highlight in
+        # cb_kwargs for exactly this purpose. Failure here is non-fatal:
+        # primary indexing already succeeded above.
+        x2text_config_hash = cb.get("x2text_config_hash", "")
+        enable_highlight = cb.get("enable_highlight", False)
+        if x2text_config_hash and profile_manager_id:
+            try:
+                api.mark_extraction_status(
+                    document_id=document_id,
+                    profile_manager_id=profile_manager_id,
+                    x2text_config_hash=x2text_config_hash,
+                    enable_highlight=enable_highlight,
+                    organization_id=org_id,
+                )
+            except Exception:
+                logger.warning(
+                    "Failed to mark extraction_status for document %s "
+                    "profile %s; primary indexing succeeded.",
+                    document_id,
+                    profile_manager_id,
+                    exc_info=True,
+                )
+
         # Handle summary index tracking via backend endpoint
         # (requires PromptIdeBaseTool + IndexingUtils which need Django ORM)
         summary_profile_id = cb.get("summary_profile_id", "")
diff --git a/workers/shared/clients/prompt_studio_client.py b/workers/shared/clients/prompt_studio_client.py
@@ -15,6 +15,7 @@
 _OUTPUT_ENDPOINT = "v1/prompt-studio/output/"
 _INDEX_ENDPOINT = "v1/prompt-studio/index/"
 _INDEXING_STATUS_ENDPOINT = "v1/prompt-studio/indexing-status/"
+_EXTRACTION_STATUS_ENDPOINT = "v1/prompt-studio/extraction-status/"
 _PROFILE_ENDPOINT = "v1/prompt-studio/profile/{profile_id}/"
 _HUBSPOT_ENDPOINT = "v1/prompt-studio/hubspot-notify/"
 _SUMMARY_INDEX_KEY_ENDPOINT = "v1/prompt-studio/summary-index-key/"
@@ -71,6 +72,33 @@ def update_index_manager(
         }
         return self.post(_INDEX_ENDPOINT, data=payload, organization_id=organization_id)
 
+    def mark_extraction_status(
+        self,
+        document_id: str,
+        profile_manager_id: str,
+        x2text_config_hash: str,
+        enable_highlight: bool,
+        organization_id: str | None = None,
+        extracted: bool = True,
+        error_message: str | None = None,
+    ) -> dict[str, Any]:
+        """Mark IndexManager.extraction_status for a document+profile pair.
+
+        Called from the ide_index_complete callback so that subsequent
+        Answer Prompt dispatches can short-circuit re-extraction.
+        """
+        payload = {
+            "document_id": document_id,
+            "profile_manager_id": profile_manager_id,
+            "x2text_config_hash": x2text_config_hash,
+            "enable_highlight": enable_highlight,
+            "extracted": extracted,
+            "error_message": error_message,
+        }
+        return self.post(
+            _EXTRACTION_STATUS_ENDPOINT, data=payload, organization_id=organization_id
+        )
+
     def mark_document_indexed(
         self,
         org_id: str,
diff --git a/workers/tests/test_legacy_executor_index.py b/workers/tests/test_legacy_executor_index.py
@@ -220,6 +220,38 @@ def test_reindex_passed_through(self, mock_get_fs, mock_indexing_deps):
         assert result.success is True
         init_call = mock_index_cls.call_args
         assert init_call.kwargs["processing_options"].reindex is True
+        # reindex=True with already-indexed doc must still call perform_indexing
+        mock_index_cls.return_value.perform_indexing.assert_called_once()
+
+    @patch(_PATCH_FS)
+    def test_already_indexed_no_reindex_short_circuits(
+        self, mock_get_fs, mock_indexing_deps
+    ):
+        """doc_id already in VDB and reindex=False → skip perform_indexing.
+
+        This is the defense-in-depth guard introduced for the IDE
+        re-indexing fix: even if the Redis cache misses and Answer Prompt
+        re-dispatches index, the executor must not re-write the same chunks
+        into the vector store.
+        """
+        mock_index_cls, mock_emb_cls, mock_vdb_cls = mock_indexing_deps
+        _register_legacy()
+        executor = ExecutorRegistry.get("legacy")
+
+        mock_index = _setup_mock_index(mock_index_cls, "doc-already-indexed")
+        mock_index.is_document_indexed.return_value = True
+        mock_emb_cls.return_value = MagicMock()
+        mock_vdb_cls.return_value = MagicMock()
+        mock_get_fs.return_value = MagicMock()
+
+        # reindex defaults to False
+        ctx = _make_index_context()
+        result = executor.execute(ctx)
+
+        assert result.success is True
+        assert result.data[IKeys.DOC_ID] == "doc-already-indexed"
+        mock_index.is_document_indexed.assert_called_once()
+        mock_index.perform_indexing.assert_not_called()
 
 
 # --- 5. VectorDB.close() always called ---