| title | 🦜 LangChain |
|---|---|
| description | Wrap ScrapeGraph endpoints as vanilla LangChain tools |
Every ScrapeGraph v2 endpoint is one method on the official scrapegraph-py SDK. Wrap each one with LangChain's built-in @tool decorator and you get a fully typed toolkit — no extra dependency, no third-party integration package, full control over arguments and return shapes.
pip install langchain langchain-openai scrapegraph-py
Set your keys:
export SGAI_API_KEY="your-scrapegraph-key"
export OPENAI_API_KEY="your-openai-key"
Save this once as sgai_tools.py — every example below imports from it.
from typing import Optional
from langchain_core.tools import tool
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig, JsonFormatConfig
# Shared SDK client used by every tool below; constructing it with no
# arguments makes the SDK read the SGAI_API_KEY environment variable.
sgai = ScrapeGraphAI() # reads SGAI_API_KEY from env
def _unwrap(result):
"""Return the SDK response payload as a plain dict."""
if result.error:
raise RuntimeError(f"ScrapeGraph error: {result.error}")
data = result.data
return data.model_dump() if hasattr(data, "model_dump") else data
# --- content endpoints -------------------------------------------------------
@tool
def scrape(url: str) -> dict:
    """Fetch a web page and return its content as markdown."""
    # Request markdown output only; _unwrap raises if the API reports an error.
    response = sgai.scrape(url=url, formats=[MarkdownFormatConfig()])
    return _unwrap(response)
@tool
def extract(url: str, prompt: str) -> dict:
    """Extract structured data from a web page using a natural-language prompt."""
    # The prompt drives what the API extracts from the page at `url`.
    response = sgai.extract(prompt=prompt, url=url)
    return _unwrap(response)
@tool
def search(query: str, num_results: int = 3) -> dict:
    """Run an AI web search; returns ranked results with fetched content."""
    # Delegate to the SDK search endpoint and normalise the response shape.
    response = sgai.search(query=query, num_results=num_results)
    return _unwrap(response)
# --- crawl (async job) -------------------------------------------------------
@tool
def crawl_start(url: str, max_depth: int = 2, max_pages: int = 10) -> dict:
    """Start a multi-page crawl job. Returns a dict including the crawl `id`."""
    # Crawl is asynchronous: this only enqueues the job; poll with crawl_get.
    response = sgai.crawl.start(
        url=url,
        max_depth=max_depth,
        max_pages=max_pages,
        formats=[MarkdownFormatConfig()],
    )
    return _unwrap(response)
@tool
def crawl_get(crawl_id: str) -> dict:
    """Fetch the status and result of a crawl job."""
    response = sgai.crawl.get(crawl_id)
    return _unwrap(response)
@tool
def crawl_stop(crawl_id: str) -> dict:
    """Stop a running crawl."""
    response = sgai.crawl.stop(crawl_id)
    return _unwrap(response)
@tool
def crawl_resume(crawl_id: str) -> dict:
    """Resume a stopped crawl."""
    response = sgai.crawl.resume(crawl_id)
    return _unwrap(response)
@tool
def crawl_delete(crawl_id: str) -> dict:
    """Delete a crawl job."""
    response = sgai.crawl.delete(crawl_id)
    return _unwrap(response)
# --- monitor (scheduled jobs) ------------------------------------------------
@tool
def monitor_create(url: str, interval: str, name: Optional[str] = None, prompt: Optional[str] = None) -> dict:
    """Create a scheduled monitor. If `prompt` is given each tick stores JSON
    extraction; otherwise it stores markdown. `interval` is cron syntax,
    e.g. "0 9 * * *" for daily at 9am."""
    # A prompt switches the stored format from raw markdown to JSON extraction.
    if prompt:
        formats = [JsonFormatConfig(prompt=prompt)]
    else:
        formats = [MarkdownFormatConfig()]
    response = sgai.monitor.create(url=url, interval=interval, name=name, formats=formats)
    return _unwrap(response)
@tool
def monitor_list() -> list:
    """List all monitors."""
    response = sgai.monitor.list()
    return _unwrap(response)
@tool
def monitor_get(monitor_id: str) -> dict:
    """Get one monitor by id."""
    response = sgai.monitor.get(monitor_id)
    return _unwrap(response)
@tool
def monitor_pause(monitor_id: str) -> dict:
    """Pause a monitor."""
    response = sgai.monitor.pause(monitor_id)
    return _unwrap(response)
@tool
def monitor_resume(monitor_id: str) -> dict:
    """Resume a paused monitor."""
    response = sgai.monitor.resume(monitor_id)
    return _unwrap(response)
@tool
def monitor_delete(monitor_id: str) -> dict:
    """Delete a monitor."""
    # Unwrap only to surface API errors; the return value reports the id
    # deleted rather than the raw payload (NOTE: differs from crawl_delete,
    # which returns the unwrapped response — presumably intentional).
    response = sgai.monitor.delete(monitor_id)
    _unwrap(response)
    return {"deleted": monitor_id}
@tool
def monitor_activity(monitor_id: str) -> dict:
    """Get the recent runs of a monitor."""
    response = sgai.monitor.activity(monitor_id)
    return _unwrap(response)
# --- account / history -------------------------------------------------------
@tool
def history_list(service: Optional[str] = None, page: int = 1, limit: int = 20) -> dict:
    """List recent API request history, optionally filtered by service."""
    response = sgai.history.list(service=service, page=page, limit=limit)
    return _unwrap(response)
@tool
def history_get(request_id: str) -> dict:
    """Get a single history entry by request id."""
    response = sgai.history.get(request_id)
    return _unwrap(response)
@tool
def credits() -> dict:
    """Check remaining ScrapeGraph API credits."""
    response = sgai.credits()
    return _unwrap(response)
# Full toolkit, grouped by endpoint family — hand this list to an agent,
# or import individual tools directly.
ALL_TOOLS = [
    scrape, extract, search,
    crawl_start, crawl_get, crawl_stop, crawl_resume, crawl_delete,
    monitor_create, monitor_list, monitor_get,
    monitor_pause, monitor_resume, monitor_delete, monitor_activity,
    history_list, history_get, credits,
]
| ScrapeGraph endpoint | SDK call | LangChain tool |
|---|---|---|
| POST /scrape | sgai.scrape(url=...) | scrape |
| POST /extract | sgai.extract(prompt=..., url=...) | extract |
| POST /search | sgai.search(query=...) | search |
| POST /crawl | sgai.crawl.start(url=...) | crawl_start |
| GET /crawl/{id} | sgai.crawl.get(id) | crawl_get |
| POST /crawl/{id}/stop | sgai.crawl.stop(id) | crawl_stop |
| POST /crawl/{id}/resume | sgai.crawl.resume(id) | crawl_resume |
| DELETE /crawl/{id} | sgai.crawl.delete(id) | crawl_delete |
| POST /monitor | sgai.monitor.create(url=..., interval=...) | monitor_create |
| GET /monitor | sgai.monitor.list() | monitor_list |
| GET /monitor/{id} | sgai.monitor.get(id) | monitor_get |
| POST /monitor/{id}/pause | sgai.monitor.pause(id) | monitor_pause |
| POST /monitor/{id}/resume | sgai.monitor.resume(id) | monitor_resume |
| DELETE /monitor/{id} | sgai.monitor.delete(id) | monitor_delete |
| GET /monitor/{id}/activity | sgai.monitor.activity(id) | monitor_activity |
| GET /history | sgai.history.list(...) | history_list |
| GET /history/{id} | sgai.history.get(id) | history_get |
| GET /credits | sgai.credits() | credits |
Call any tool by itself without an LLM — useful for scripts, tests, or as a building block inside chains.
from sgai_tools import scrape, extract, search, credits, crawl_start, crawl_get
# Credits check — a cheap way to verify the API key works.
print(credits.invoke({}))
# Single page fetched as markdown.
print(scrape.invoke({"url": "https://example.com"}))
# Prompt-driven structured extraction.
print(extract.invoke({
    "url": "https://scrapegraphai.com",
    "prompt": "Extract the company name and a short description",
}))
# AI search with fetched content for each hit.
print(search.invoke({"query": "best AI scraping tools 2026", "num_results": 3}))
# Crawl is asynchronous: start returns a job dict with an id, then poll crawl_get.
job = crawl_start.invoke({"url": "https://scrapegraphai.com", "max_depth": 1, "max_pages": 5})
print(crawl_get.invoke({"crawl_id": job["id"]}))
Give the LLM the whole toolkit and let it pick. LangChain v1's create_agent works with any chat model that supports tool calling (ChatOpenAI, ChatAnthropic, etc.).
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from sgai_tools import ALL_TOOLS
# temperature=0 to minimise sampling randomness when the model picks tools.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
agent = create_agent(
    model=llm,
    tools=ALL_TOOLS,
    system_prompt="You are a web research agent. Use ScrapeGraph tools to gather and extract web data.",
)
# The agent loops: pick a tool, observe the result, repeat until it can answer.
result = agent.invoke({
    "messages": [("user", "Find the pricing page of scrapegraphai.com and list the plan names and prices.")],
})
print(result["messages"][-1].content)
extract already returns structured JSON under the json_data key. Validate it into a Pydantic model for type safety downstream.
from pydantic import BaseModel, Field
from sgai_tools import extract
class Company(BaseModel):
    """Expected shape of the `extract` result; Field descriptions double as docs."""
    name: str = Field(description="Company name")
    tagline: str = Field(description="One-line description of what they do")
result = extract.invoke({
    "url": "https://scrapegraphai.com",
    "prompt": "Return an object with 'name' and 'tagline' describing the company",
})
# Validate the extracted payload; raises pydantic.ValidationError on mismatch.
company = Company(**result["json_data"])
print(company)
Compose tools with a plain function when the sequence is fixed.
from sgai_tools import search, extract
def _search_then_extract(query: str) -> dict:
    """Search the web for `query`, then summarise the top hit."""
    search_result = search.invoke({"query": query, "num_results": 1})
    best_url = search_result["results"][0]["url"]
    summary_prompt = "Summarise this page in 3 bullet points"
    return extract.invoke({"url": best_url, "prompt": summary_prompt})
print(_search_then_extract("scrapegraphai documentation"))