"""LLM client abstraction and an Anthropic-backed implementation."""

from __future__ import annotations

import sys
import time
from abc import ABC, abstractmethod

import anthropic

from felderize.config import Config


class LLMClient(ABC):
    """Abstract interface for a client that answers a system/user prompt pair."""

    @abstractmethod
    def translate(self, system_prompt: str, user_prompt: str) -> str:
        """Send a translation request and return the raw response text."""


class AnthropicClient(LLMClient):
    """`LLMClient` implementation that calls the Anthropic Messages API.

    The class-level constants below hold the request/retry tuning values so
    that a subclass can override them without re-implementing `translate`.
    """

    # Upper bound on generated tokens passed to `messages.create`.
    MAX_TOKENS = 4096
    # Total number of API calls attempted before a rate-limit error propagates.
    MAX_ATTEMPTS = 5
    # Wait grows linearly: step * attempt_number (60s, 120s, 180s, ...).
    RETRY_WAIT_STEP_SECONDS = 60

    def __init__(self, config: Config):
        """Build the SDK client from *config*.

        Reads `config.api_key`, `config.base_url` and `config.model`.
        """
        self.client = anthropic.Anthropic(
            api_key=config.api_key, base_url=config.base_url
        )
        self.model = config.model

    def translate(self, system_prompt: str, user_prompt: str) -> str:
        """Send the prompts to the model and return the response text.

        The system prompt is sent as a single text block marked with
        ``cache_control={"type": "ephemeral"}``. Token usage is printed to
        stderr after each successful call.

        Raises:
            anthropic.RateLimitError: if the final attempt is still rate
                limited.
            IndexError: if the response contains no text content.
        """
        for attempt in range(1, self.MAX_ATTEMPTS + 1):
            # Keep the try body to the one call that can be rate limited.
            try:
                response = self.client.messages.create(
                    model=self.model,
                    max_tokens=self.MAX_TOKENS,
                    system=[
                        {
                            "type": "text",
                            "text": system_prompt,
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                    messages=[{"role": "user", "content": user_prompt}],
                )
            except anthropic.RateLimitError:
                if attempt == self.MAX_ATTEMPTS:
                    raise
                wait = self.RETRY_WAIT_STEP_SECONDS * attempt
                # NOTE(review): this notice goes to stdout while usage stats
                # go to stderr — confirm the split is intentional.
                print(f"Rate limited — waiting {wait}s before retry...", flush=True)
                time.sleep(wait)
                continue

            self._log_usage(response.usage)
            return self._extract_text(response)

        # Only reachable if MAX_ATTEMPTS is overridden to a value below 1.
        raise AssertionError("unreachable")

    @staticmethod
    def _log_usage(usage) -> None:
        """Print input/cache/output token counts for one response to stderr."""
        print(
            f" llm: input={usage.input_tokens} "
            f"cache_read={getattr(usage, 'cache_read_input_tokens', 0)} "
            f"cache_write={getattr(usage, 'cache_creation_input_tokens', 0)} "
            f"output={usage.output_tokens}",
            file=sys.stderr,
        )

    @staticmethod
    def _extract_text(response) -> str:
        """Return the concatenated text of every text-bearing content block.

        Blocks without a string ``text`` attribute are skipped, so a non-text
        first block no longer causes an ``AttributeError`` and text split
        across several blocks is not truncated to the first one.

        Raises:
            IndexError: if no block carries text (same exception type the
                previous ``content[0]`` lookup raised on an empty list).
        """
        parts = [
            block.text
            for block in response.content
            if isinstance(getattr(block, "text", None), str)
        ]
        if not parts:
            raise IndexError("LLM response contained no text content blocks")
        return "".join(parts)


def create_client(config: Config) -> LLMClient:
    """Return the `LLMClient` implementation for *config* (always Anthropic)."""
    return AnthropicClient(config)