HalluciTrace/api/engine/extractor.py at main · httpsVishu/HalluciTrace

157 lines (131 loc) · 5.01 KB
"""
extractor.py
------------
Extracts atomic factual claims from agent step output.
  openai   — GPT-4o-mini, structured prompt, retry on bad JSON
  fallback — heuristic sentence splitter, no API needed
"""
from __future__ import annotations

import hashlib
import json
import re
import time
from dataclasses import dataclass
from typing import Optional
@dataclass
class Claim:
    """A single factual claim extracted from one agent step."""

    # Short identifier for the claim (callers in this module pass the
    # result of ``_make_id``).
    claim_id: str
    # Identifier of the agent step the claim was extracted from.
    step_id: int
    # The claim sentence itself.
    text: str
    # Which extractor produced the claim.
    source: str    # "openai" | "fallback"

    def to_dict(self) -> dict:
        """Return the claim as a plain dict.

        Note that the claim text is exported under the key ``"claim"``,
        not ``"text"``.
        """
        return {
            "claim_id": self.claim_id,
            "step_id":  self.step_id,
            "claim":    self.text,
            "source":   self.source,
        }
def _make_id(step_id: int, text: str) -> str:
    """Return a 12-character hex id derived from the step id and claim text.

    The text is stripped and lower-cased before hashing, so claims that
    differ only in surrounding whitespace or letter case share an id
    within the same step.
    """
    normalized = text.strip().lower()
    digest = hashlib.md5(f"{step_id}:{normalized}".encode())
    return digest.hexdigest()[:12]
# ---------------------------------------------------------------------------
# Fallback heuristic extractor
# ---------------------------------------------------------------------------
# Sentence openers treated as non-factual (first-person statements, notes,
# references to surrounding text). Matched at the start of a sentence,
# case-insensitively.
_NON_FACTUAL = re.compile(
    r"^(i |we |let me |this |note:|warning:|todo|"
    r"in (this|our)|the (following|above|below)|"
    r"as (mentioned|noted|stated)|based on|according to our)",
    re.IGNORECASE,
)
# A sentence whose last character is a question mark.
_QUESTION    = re.compile(r"\?$")
# Verbs whose presence (as whole words, any case) marks a sentence as a
# candidate factual statement.
_FACTUAL_SIG = re.compile(
    r"\b(is|are|was|were|has|have|had|will|does|did|"
    r"equals?|contains?|means?|represents?|indicates?|"
    r"results? in|leads? to|causes?|founded|established|"
    r"located|released|launched|acquired)\b",
    re.IGNORECASE,
)
# Sentence boundary: whitespace preceded by ".", "!" or "?" and followed by
# an ASCII capital letter.
_SENT_SPLIT  = re.compile(r"(?<=[.!?])\s+(?=[A-Z])")
def _fallback(text: str, step_id: int) -> list[Claim]:
    """Extract claims heuristically, without calling any API.

    The text is cut into candidate sentences, first on sentence boundaries
    and then on line breaks. A candidate is kept only when it has at least
    four words, is not a question, does not start with a ``_NON_FACTUAL``
    opener, and contains a ``_FACTUAL_SIG`` word. Duplicates are dropped
    case-insensitively; the first occurrence wins.

    Args:
        text: Raw step output to scan.
        step_id: Step identifier stamped on every produced claim.

    Returns:
        Claims tagged with source ``"fallback"``, in order of appearance.
    """
    stripped = text.strip()
    if not stripped:
        return []

    candidates = [
        line
        for piece in _SENT_SPLIT.split(stripped)
        for line in piece.splitlines()
    ]

    found: list[Claim] = []
    emitted: set[str] = set()
    for candidate in candidates:
        sentence = candidate.strip().rstrip(".")
        # An empty sentence has zero words, so this also rejects blanks.
        if len(sentence.split()) < 4:
            continue
        if _QUESTION.search(sentence) or _NON_FACTUAL.match(sentence):
            continue
        if _FACTUAL_SIG.search(sentence) is None:
            continue
        key = sentence.lower()
        if key in emitted:
            continue
        emitted.add(key)
        claim_id = _make_id(step_id, sentence)
        found.append(Claim(claim_id, step_id, sentence, "fallback"))
    return found
# ---------------------------------------------------------------------------
# OpenAI extractor
# ---------------------------------------------------------------------------
    "You are a factual claim extractor. "
    "Extract only atomic, verifiable factual claims. "
    "No opinions, instructions, hedged statements, or meta-commentary. "
    "Return a JSON array of strings only — no markdown, no explanation."
_USR  = "Text:\n{text}\n\nReturn only a JSON array of claim strings."
    "Your response was not valid JSON. "
    "Return ONLY a raw JSON array of strings. No fences, no explanation.\n\n"
    "Text:\n{text}"
def _strip_fences(s: str) -> str:
    """Remove a surrounding Markdown code fence from *s*, if present.

    Strips a leading triple-backtick fence (optionally tagged ``json``, in
    any letter case) and a trailing triple-backtick fence, along with the
    surrounding whitespace.
    """
    trimmed = s.strip()
    without_opening = re.sub(r"^```(?:json)?\s*", "", trimmed, flags=re.IGNORECASE)
    without_closing = re.sub(r"\s*```$", "", without_opening)
    return without_closing.strip()
def _parse(content: str) -> list[str] | None:
    """Parse a model reply into a list of non-empty claim strings.

    Args:
        content: Raw reply text, optionally wrapped in Markdown code fences.

    Returns:
        The stripped, non-empty string form of every element when the reply
        decodes to a JSON array; ``None`` when the reply is not valid JSON
        or decodes to anything other than a list.
    """
    try:
        parsed = json.loads(_strip_fences(content))
    except json.JSONDecodeError:
        return None
    if not isinstance(parsed, list):
        return None
    # Stringify each element once, then drop entries that end up blank.
    cleaned = (str(item).strip() for item in parsed)
    return [claim for claim in cleaned if claim]
def _openai_extract(text: str, step_id: int, client) -> list[Claim]:
    """Extract claims via the chat-completions API, with heuristic fallback.

    Sends *text* to ``gpt-4o-mini`` and parses the reply as a JSON array of
    claim strings. If the first reply cannot be parsed, the request is
    retried once with the bad reply and a stricter instruction appended.
    If the retry also fails to parse, or anything raises along the way,
    the heuristic ``_fallback`` extractor is used instead.

    Args:
        text: Step output to extract claims from.
        step_id: Step identifier stamped on every produced claim.
        client: Object exposing ``chat.completions.create(...)``.

    Returns:
        Claims tagged ``"openai"`` on success, otherwise whatever
        ``_fallback`` returns (tagged ``"fallback"``).
    """
    try:
        r = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system", "content": _SYS},
                      {"role": "user",   "content": _USR.format(text=text)}],
            temperature=0, max_tokens=1024,
        )
        content = r.choices[0].message.content or ""
        parsed  = _parse(content)
        if parsed is not None:
            return [Claim(_make_id(step_id, c), step_id, c, "openai") for c in parsed]

        # One retry with stricter prompt; the failed reply is replayed as the
        # assistant turn so the model sees what it got wrong.
        time.sleep(0.4)
        r2 = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "system",    "content": _SYS},
                      {"role": "user",      "content": _USR.format(text=text)},
                      {"role": "assistant", "content": content},
                      {"role": "user",      "content": _RETRY.format(text=text)}],
            temperature=0, max_tokens=1024,
        )
        parsed2 = _parse(r2.choices[0].message.content or "")
        if parsed2 is not None:
            return [Claim(_make_id(step_id, c), step_id, c, "openai") for c in parsed2]
        return _fallback(text, step_id)
    except Exception:
        # Deliberate best-effort: any API or response-shape failure degrades
        # to the offline extractor rather than propagating to the caller.
        return _fallback(text, step_id)
def extract_claims(text: str, step_id: int, client=None) -> list[Claim]:
    """Extract factual claims from one step's output.

    Args:
        text: Step output to analyse; empty or whitespace-only text yields
            no claims.
        step_id: Step identifier stamped on every produced claim.
        client: Optional chat-completions client. When truthy, the API-based
            extractor is used; otherwise the heuristic extractor runs.

    Returns:
        The extracted claims, or an empty list when there is no text.
    """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        return []
    if client:
        return _openai_extract(text, step_id, client)
    return _fallback(text, step_id)
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

extractor.py

Latest commit

History

extractor.py

File metadata and controls