Skip to content

Latest commit

Β 

History

History
298 lines (236 loc) · 9.6 KB

File metadata and controls

298 lines (236 loc) · 9.6 KB
title 🦜 LangChain
description Wrap ScrapeGraph endpoints as vanilla LangChain tools

Overview

Every ScrapeGraph v2 endpoint is one method on the official scrapegraph-py SDK. Wrap each one with LangChain's built-in @tool decorator and you get a fully typed toolkit — no extra dependency, no third-party integration package, full control over arguments and return shapes.

See also: how LangChain's `@tool` decorator works · the official Python SDK for ScrapeGraph v2 (scrapegraph-py)

Installation

pip install langchain langchain-openai scrapegraph-py

Set your keys:

export SGAI_API_KEY="your-scrapegraph-key"
export OPENAI_API_KEY="your-openai-key"
Get your ScrapeGraph API key from the [dashboard](https://scrapegraphai.com/dashboard).

Build the toolkit

Save this once as sgai_tools.py — every example below imports from it.

from typing import Optional
from langchain_core.tools import tool
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig, JsonFormatConfig

sgai = ScrapeGraphAI()  # reads SGAI_API_KEY from env

def _unwrap(result):
    """Return the SDK response payload as a plain dict."""
    if result.error:
        raise RuntimeError(f"ScrapeGraph error: {result.error}")
    data = result.data
    return data.model_dump() if hasattr(data, "model_dump") else data

# --- content endpoints -------------------------------------------------------

@tool
def scrape(url: str) -> dict:
    """Fetch a web page and return its content as markdown."""
    return _unwrap(sgai.scrape(url=url, formats=[MarkdownFormatConfig()]))

@tool
def extract(url: str, prompt: str) -> dict:
    """Extract structured data from a web page using a natural-language prompt."""
    return _unwrap(sgai.extract(prompt=prompt, url=url))

@tool
def search(query: str, num_results: int = 3) -> dict:
    """Run an AI web search; returns ranked results with fetched content."""
    return _unwrap(sgai.search(query=query, num_results=num_results))

# --- crawl (async job) -------------------------------------------------------

@tool
def crawl_start(url: str, max_depth: int = 2, max_pages: int = 10) -> dict:
    """Start a multi-page crawl job. Returns a dict including the crawl `id`."""
    return _unwrap(sgai.crawl.start(
        url=url, max_depth=max_depth, max_pages=max_pages,
        formats=[MarkdownFormatConfig()],
    ))

@tool
def crawl_get(crawl_id: str) -> dict:
    """Fetch the status and result of a crawl job."""
    return _unwrap(sgai.crawl.get(crawl_id))

@tool
def crawl_stop(crawl_id: str) -> dict:
    """Stop a running crawl."""
    return _unwrap(sgai.crawl.stop(crawl_id))

@tool
def crawl_resume(crawl_id: str) -> dict:
    """Resume a stopped crawl."""
    return _unwrap(sgai.crawl.resume(crawl_id))

@tool
def crawl_delete(crawl_id: str) -> dict:
    """Delete a crawl job."""
    return _unwrap(sgai.crawl.delete(crawl_id))

# --- monitor (scheduled jobs) ------------------------------------------------

@tool
def monitor_create(url: str, interval: str, name: Optional[str] = None, prompt: Optional[str] = None) -> dict:
    """Create a scheduled monitor. If `prompt` is given each tick stores JSON
    extraction; otherwise it stores markdown. `interval` is cron syntax,
    e.g. "0 9 * * *" for daily at 9am."""
    formats = [JsonFormatConfig(prompt=prompt)] if prompt else [MarkdownFormatConfig()]
    return _unwrap(sgai.monitor.create(url=url, interval=interval, name=name, formats=formats))

@tool
def monitor_list() -> list:
    """List all monitors."""
    return _unwrap(sgai.monitor.list())

@tool
def monitor_get(monitor_id: str) -> dict:
    """Get one monitor by id."""
    return _unwrap(sgai.monitor.get(monitor_id))

@tool
def monitor_pause(monitor_id: str) -> dict:
    """Pause a monitor."""
    return _unwrap(sgai.monitor.pause(monitor_id))

@tool
def monitor_resume(monitor_id: str) -> dict:
    """Resume a paused monitor."""
    return _unwrap(sgai.monitor.resume(monitor_id))

@tool
def monitor_delete(monitor_id: str) -> dict:
    """Delete a monitor."""
    _unwrap(sgai.monitor.delete(monitor_id))
    return {"deleted": monitor_id}

@tool
def monitor_activity(monitor_id: str) -> dict:
    """Get the recent runs of a monitor."""
    return _unwrap(sgai.monitor.activity(monitor_id))

# --- account / history -------------------------------------------------------

@tool
def history_list(service: Optional[str] = None, page: int = 1, limit: int = 20) -> dict:
    """List recent API request history, optionally filtered by service."""
    return _unwrap(sgai.history.list(service=service, page=page, limit=limit))

@tool
def history_get(request_id: str) -> dict:
    """Get a single history entry by request id."""
    return _unwrap(sgai.history.get(request_id))

@tool
def credits() -> dict:
    """Check remaining ScrapeGraph API credits."""
    return _unwrap(sgai.credits())

ALL_TOOLS = [
    scrape, extract, search,
    crawl_start, crawl_get, crawl_stop, crawl_resume, crawl_delete,
    monitor_create, monitor_list, monitor_get,
    monitor_pause, monitor_resume, monitor_delete, monitor_activity,
    history_list, history_get, credits,
]

Endpoint → tool reference

ScrapeGraph endpoint SDK call LangChain tool
POST /scrape sgai.scrape(url=...) scrape
POST /extract sgai.extract(prompt=..., url=...) extract
POST /search sgai.search(query=...) search
POST /crawl sgai.crawl.start(url=...) crawl_start
GET /crawl/{id} sgai.crawl.get(id) crawl_get
POST /crawl/{id}/stop sgai.crawl.stop(id) crawl_stop
POST /crawl/{id}/resume sgai.crawl.resume(id) crawl_resume
DELETE /crawl/{id} sgai.crawl.delete(id) crawl_delete
POST /monitor sgai.monitor.create(url=..., interval=...) monitor_create
GET /monitor sgai.monitor.list() monitor_list
GET /monitor/{id} sgai.monitor.get(id) monitor_get
POST /monitor/{id}/pause sgai.monitor.pause(id) monitor_pause
POST /monitor/{id}/resume sgai.monitor.resume(id) monitor_resume
DELETE /monitor/{id} sgai.monitor.delete(id) monitor_delete
GET /monitor/{id}/activity sgai.monitor.activity(id) monitor_activity
GET /history sgai.history.list(...) history_list
GET /history/{id} sgai.history.get(id) history_get
GET /credits sgai.credits() credits

Direct invocation

Call any tool by itself without an LLM — useful for scripts, tests, or as a building block inside chains.

from sgai_tools import scrape, extract, search, credits, crawl_start, crawl_get

print(credits.invoke({}))
print(scrape.invoke({"url": "https://example.com"}))
print(extract.invoke({
    "url": "https://scrapegraphai.com",
    "prompt": "Extract the company name and a short description",
}))
print(search.invoke({"query": "best AI scraping tools 2026", "num_results": 3}))

job = crawl_start.invoke({"url": "https://scrapegraphai.com", "max_depth": 1, "max_pages": 5})
print(crawl_get.invoke({"crawl_id": job["id"]}))

Tool-calling agent

Give the LLM the whole toolkit and let it pick. LangChain v1's create_agent works with any chat model that supports tool calling (ChatOpenAI, ChatAnthropic, etc.).

from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from sgai_tools import ALL_TOOLS

llm = ChatOpenAI(model="gpt-4o", temperature=0)
agent = create_agent(
    model=llm,
    tools=ALL_TOOLS,
    system_prompt="You are a web research agent. Use ScrapeGraph tools to gather and extract web data.",
)

result = agent.invoke({
    "messages": [("user", "Find the pricing page of scrapegraphai.com and list the plan names and prices.")],
})
print(result["messages"][-1].content)
`create_agent` returns a compiled LangGraph under the hood — see the [LangGraph page](/integrations/langgraph) for advanced patterns (custom `StateGraph`, `ToolNode`, checkpointing).

Structured output with Pydantic

extract already returns structured JSON under the json_data key. Validate it into a Pydantic model for type safety downstream.

from pydantic import BaseModel, Field
from sgai_tools import extract

class Company(BaseModel):
    name: str = Field(description="Company name")
    tagline: str = Field(description="One-line description of what they do")

result = extract.invoke({
    "url": "https://scrapegraphai.com",
    "prompt": "Return an object with 'name' and 'tagline' describing the company",
})
company = Company(**result["json_data"])
print(company)

Chain pattern

Compose tools with LCEL when the sequence is fixed.

from sgai_tools import search, extract

def _search_then_extract(query: str) -> dict:
    hits = search.invoke({"query": query, "num_results": 1})
    top_url = hits["results"][0]["url"]
    return extract.invoke({"url": top_url, "prompt": "Summarise this page in 3 bullet points"})

print(_search_then_extract("scrapegraphai documentation"))

Support

Source and issues for scrapegraph-py Get help from our community